From 9042fcad49bc16ca9adab700389a7ebd2bfef47d Mon Sep 17 00:00:00 2001 From: Tk-Glitch Date: Thu, 26 Mar 2020 13:41:24 +0100 Subject: [PATCH] linux55-tkg: 5.5.13 - previous index was here: https://github.com/Tk-Glitch/PKGBUILDS --- linux54-tkg/PKGBUILD | 1063 + linux54-tkg/README.md | 38 + linux54-tkg/customization.cfg | 169 + linux54-tkg/linux54-tkg-config/config.x86_64 | 10609 ++++++++++ .../linux54-tkg-config/config_hardened.x86_64 | 10527 +++++++++ .../generic-desktop-profile.cfg | 55 + .../ryzen-desktop-profile.cfg | 58 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../0002-clear-patches.patch | 354 + .../0003-glitched-base.patch | 5199 +++++ .../0003-glitched-cfs.patch | 72 + .../linux54-tkg-patches/0004-5.4-ck1.patch | 17679 ++++++++++++++++ .../0004-glitched-muqss.patch | 78 + .../0004-glitched-ondemand-muqss.patch | 18 + .../0005-glitched-ondemand-pds.patch | 18 + .../0005-glitched-pds.patch | 213 + .../0005-v5.4_undead-pds099o.patch | 8387 ++++++++ .../0006-add-acs-overrides_iommu.patch | 192 + .../linux54-tkg-patches/0007-v5.4-fsync.patch | 419 + .../0009-bmq_v5.4-r2.patch | 7601 +++++++ .../0009-glitched-bmq.patch | 108 + .../linux54-tkg-patches/0011-ZFS-fix.patch | 43 + .../0012-linux-hardened.patch | 2806 +++ linux55-tkg/PKGBUILD | 1115 + linux55-tkg/README.md | 38 + linux55-tkg/customization.cfg | 175 + linux55-tkg/linux55-tkg-config/config.x86_64 | 10680 ++++++++++ .../generic-desktop-profile.cfg | 48 + .../ryzen-desktop-profile.cfg | 51 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../0002-clear-patches.patch | 354 + .../0003-glitched-base.patch | 3965 ++++ .../0003-glitched-cfs.patch | 72 + .../linux55-tkg-patches/0004-5.5-ck1.patch | 12598 +++++++++++ .../0004-glitched-muqss.patch | 78 + .../0004-glitched-ondemand-muqss.patch | 18 + .../0005-glitched-ondemand-pds.patch | 18 + .../0005-glitched-pds.patch | 166 + .../0005-v5.5_undead-pds099o.patch | 8369 ++++++++ .../0006-add-acs-overrides_iommu.patch | 192 + .../linux55-tkg-patches/0007-v5.5-fsync.patch | 419 + .../0009-bmq_v5.5-r3.patch | 7624 +++++++ .../0009-glitched-bmq.patch | 90 + .../0009-glitched-ondemand-bmq.patch | 18 + .../linux55-tkg-patches/0011-ZFS-fix.patch | 43 + linux56-rc-tkg/PKGBUILD | 1114 + linux56-rc-tkg/README.md | 36 + linux56-rc-tkg/customization.cfg | 175 + .../linux56-tkg-config/config.x86_64 | 10751 ++++++++++ .../generic-desktop-profile.cfg | 55 + .../ryzen-desktop-profile.cfg | 58 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../0002-clear-patches.patch | 354 + .../0003-glitched-base.patch | 1459 ++ .../0003-glitched-cfs.patch | 72 + .../linux56-tkg-patches/0004-5.6-ck1.patch | 12561 +++++++++++ .../0004-glitched-muqss.patch | 78 + .../0004-glitched-ondemand-muqss.patch | 18 + .../0005-glitched-ondemand-pds.patch | 18 + .../0005-glitched-pds.patch | 166 + .../0005-v5.6_undead-pds099o.patch | 8369 ++++++++ .../0006-add-acs-overrides_iommu.patch | 192 + .../linux56-tkg-patches/0007-v5.6-fsync.patch | 419 + .../0009-bmq_v5.6-r0.patch | 7624 +++++++ .../0009-glitched-bmq.patch | 90 + .../0009-glitched-ondemand-bmq.patch | 18 + .../linux56-tkg-patches/0011-ZFS-fix.patch | 43 + .../0013-tp_smapi_ec.patch | 2415 +++ 68 files changed, 158390 insertions(+) create mode 100644 linux54-tkg/PKGBUILD create mode 100644 linux54-tkg/README.md create mode 100644 linux54-tkg/customization.cfg create mode 100644 linux54-tkg/linux54-tkg-config/config.x86_64 create mode 100644 linux54-tkg/linux54-tkg-config/config_hardened.x86_64 create mode 100644 
linux54-tkg/linux54-tkg-config/generic-desktop-profile.cfg create mode 100644 linux54-tkg/linux54-tkg-config/ryzen-desktop-profile.cfg create mode 100644 linux54-tkg/linux54-tkg-patches/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch create mode 100644 linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch create mode 100644 linux55-tkg/PKGBUILD create mode 100644 linux55-tkg/README.md create mode 100644 linux55-tkg/customization.cfg create mode 100644 linux55-tkg/linux55-tkg-config/config.x86_64 create mode 100644 linux55-tkg/linux55-tkg-config/generic-desktop-profile.cfg create mode 100644 linux55-tkg/linux55-tkg-config/ryzen-desktop-profile.cfg create mode 100644 linux55-tkg/linux55-tkg-patches/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0002-clear-patches.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0003-glitched-base.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0003-glitched-cfs.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0004-5.5-ck1.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0004-glitched-muqss.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0004-glitched-ondemand-muqss.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0005-glitched-ondemand-pds.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0005-glitched-pds.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0005-v5.5_undead-pds099o.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0006-add-acs-overrides_iommu.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0007-v5.5-fsync.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0009-bmq_v5.5-r3.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0009-glitched-bmq.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0009-glitched-ondemand-bmq.patch create mode 100644 linux55-tkg/linux55-tkg-patches/0011-ZFS-fix.patch create mode 100644 linux56-rc-tkg/PKGBUILD create mode 100644 linux56-rc-tkg/README.md create mode 100644 linux56-rc-tkg/customization.cfg create mode 100644 linux56-rc-tkg/linux56-tkg-config/config.x86_64 create mode 100644 linux56-rc-tkg/linux56-tkg-config/generic-desktop-profile.cfg create mode 100644 linux56-rc-tkg/linux56-tkg-config/ryzen-desktop-profile.cfg create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 
linux56-rc-tkg/linux56-tkg-patches/0002-clear-patches.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0003-glitched-base.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0003-glitched-cfs.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0004-5.6-ck1.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0004-glitched-muqss.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0004-glitched-ondemand-muqss.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0005-glitched-ondemand-pds.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0005-glitched-pds.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0005-v5.6_undead-pds099o.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0006-add-acs-overrides_iommu.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0007-v5.6-fsync.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0009-bmq_v5.6-r0.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0009-glitched-bmq.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0009-glitched-ondemand-bmq.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0011-ZFS-fix.patch create mode 100644 linux56-rc-tkg/linux56-tkg-patches/0013-tp_smapi_ec.patch diff --git a/linux54-tkg/PKGBUILD b/linux54-tkg/PKGBUILD new file mode 100644 index 0000000..d3e580d --- /dev/null +++ b/linux54-tkg/PKGBUILD @@ -0,0 +1,1063 @@ +# Based on the file created for Arch Linux by: +# Tobias Powalowski +# Thomas Baechler + +# Contributor: Tk-Glitch + +plain ' .---.` `.---.' +plain ' `/syhhhyso- -osyhhhys/`' +plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' +plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' +plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' +plain ' /ssyhhhysssssssssssssssssyhhhyss/' +plain ' .ossssssssssssssssssssssssssssso.' +plain ' :sssssssssssssssssssssssssssssssss:' +plain ' /sssssssssssssssssssssssssssssssssss/' +plain ' :sssssssssssssoosssssssoosssssssssssss:' +plain ' osssssssssssssoosssssssoossssssssssssso' +plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' +plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' +plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' +plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' +plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' +plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' +plain ' `.-:///////:-.`' + +_where=$PWD # track basedir as different Arch based distros are moving srcdir around + +cp "$_where"/linux54-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking +cp "$_where"/linux54-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking + +source "$_where"/customization.cfg # load default configuration from file + +# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. +if [ -e "$_EXT_CONFIG_PATH" ]; then + source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" +fi + +if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then + # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. + plain "Do you want to use a predefined optimized profile?" 
+ read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; +fi +if [ "$_OPTIPROFILE" == "2" ]; then + source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" +elif [ "$_OPTIPROFILE" == "3" ]; then + source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" +fi + +# source cpuschedset early if present +if [ -e "$_where"/cpuschedset ]; then + source "$_where"/cpuschedset +fi + +# CPU SCHED selector +if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then + plain "What CPU sched variant do you want to build/install?" + read -rp "`echo $' > 1.PDS\n 2.MuQSS\n 3.BMQ\n 4.CFS\nchoice[1-4?]: '`" CONDITION; + if [ "$CONDITION" == "2" ]; then + echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset + elif [ "$CONDITION" == "3" ]; then + echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset + elif [ "$CONDITION" == "4" ]; then + echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset + else + echo "_cpusched=\"pds\"" > "$_where"/cpuschedset + fi + if [ -n "$_custom_pkgbase" ]; then + echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset + fi +elif [ "$_cpusched" == "muqss" ] || [ "$_cpusched" == "MuQSS" ]; then + echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset +elif [ "$_cpusched" == "pds" ]; then + echo "_cpusched=\"pds\"" > "$_where"/cpuschedset +elif [ "$_cpusched" == "bmq" ]; then + echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset +else + echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset +fi + +source "$_where"/cpuschedset + +_basever=54 +if [ -n "$_custom_pkgbase" ]; then + pkgbase="${_custom_pkgbase}" +else + pkgbase=linux"${_basever}"-tkg-"${_cpusched}" +fi +pkgname=("${pkgbase}" "${pkgbase}-headers") +_basekernel=5.4 +_sub=28 +pkgver="${_basekernel}"."${_sub}" +pkgrel=42 +pkgdesc='Linux-tkg' +arch=('x86_64') # no i686 in here +url="http://www.kernel.org/" +license=('GPL2') +makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') +optdepends=('schedtool') +if pacman -Qq ccache &> /dev/null; then + msg2 'ccache was found and will be used' + options=('!strip' 'ccache') +else + msg2 'ccache was not found and will not be used' + options=('!strip') +fi +source=("https://www.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" + "https://www.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" + 'config.x86_64' # stock Arch config + 'config_hardened.x86_64' # hardened Arch config + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + 0004-glitched-ondemand-muqss.patch + 0004-glitched-muqss.patch + 0004-5.4-ck1.patch + 0005-glitched-ondemand-pds.patch + 0005-glitched-pds.patch + 0005-v5.4_undead-pds099o.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.4-fsync.patch + #0008-5.4-bcachefs.patch + 0009-glitched-bmq.patch + 0009-bmq_v5.4-r2.patch + 0011-ZFS-fix.patch + 0012-linux-hardened.patch +) +sha256sums=('bf338980b1670bca287f9994b7441c2361907635879169c64ae78364efc5f491' + '6965f4c20f73e4707361a69bcb71806901f835dc45fdb8232542947507144fc0' + 'b7f23bbc09b6c571b76f851f0389386a6f3a64f3d7b1b8509c8550228b0f4537' + '1f4a20d6eaaa0d969af93152a65191492400c6aa838fc1c290b0dd29bb6019d8' + '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' + 
'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' + '55db7c30c6d5e7d88cf38a4c757e447faf11a7440afbc081c3da12757073cca7' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' + 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + 'e7638aabd2041dd8a03096462f9d5d4aa58680ee0f690703ccb1912047dac4f4' + '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' + 'eac7e5d6201528e64f4bdf5e286c842511e1afc52e1518dc8e7d11932bbe0a99' + 'db03fbd179ec78941eefe1c0edde4c19071bc603511d0b5c06c04e412994b62e' + '90917e09bb06fbed6853efe9e52f8c2ba4066fca44accdf7608222212561104a' + '2d9260b80b43bbd605cf420d6bd53aa7262103dfd77196ba590ece5600b6dc0d' + '3832f828a9f402b153fc9a6829c5a4eaf6091804bcda3a0423c8e1b57e26420d' + '6a6a736cf1b3513d108bfd36f60baf50bb36b33aec21ab0d0ffad13602b7ff75' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' + 'aeb31404c26ee898d007b1f66cb9572c9884ad8eca14edc4587d68f6cba6de46') + +export KBUILD_BUILD_HOST=archlinux +export KBUILD_BUILD_USER=$pkgbase +export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" + +user_patcher() { + # To patch the user because all your base are belong to us + local _patches=("$_where"/*."${_userpatch_ext}revert") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [ "$_CONDITION" == "y" ] || [ "$_user_patches_no_confirm" == "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Reverting your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 -R < "${_f}" + echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi + + _patches=("$_where"/*."${_userpatch_ext}patch") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [ "$_CONDITION" == "y" ] || [ "$_user_patches_no_confirm" == "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Applying your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 < "${_f}" + echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi +} + +prepare() { + rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build + + ln -s "${_where}/customization.cfg" "${srcdir}" # workaround + + cd "${srcdir}/linux-${_basekernel}" + + msg2 "Setting version..." 
+ scripts/setlocalversion --save-scmversion + echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel + echo "" > localversion.20-pkgname + + # add upstream patch + patch -p1 -i ../patch-"${pkgver}" + + # ARCH Patches + if [ "${_configfile}" == "config_hardened.x86_64" ] && [ "${_cpusched}" == "cfs" ]; then + msg2 "Using linux hardened patchset" + patch -Np1 -i ../0012-linux-hardened.patch + else + patch -Np1 -i ../0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + fi + + # TkG + patch -Np1 -i ../0002-clear-patches.patch + + patch -Np1 -i ../0003-glitched-base.patch + + if [ "${_cpusched}" == "MuQSS" ]; then + # MuQSS + patch -Np1 -i ../0004-5.4-ck1.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0004-glitched-ondemand-muqss.patch + fi + patch -Np1 -i ../0004-glitched-muqss.patch + elif [ "${_cpusched}" == "pds" ]; then + # PDS-mq + patch -Np1 -i ../0005-v5.4_undead-pds099o.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0005-glitched-ondemand-pds.patch + fi + patch -Np1 -i ../0005-glitched-pds.patch + elif [ "${_cpusched}" == "bmq" ]; then + # BMQ + patch -Np1 -i ../0009-bmq_v5.4-r2.patch + patch -Np1 -i ../0009-glitched-bmq.patch + else + patch -Np1 -i ../0003-glitched-cfs.patch + fi + + if [ -z "${_configfile}" ]; then + _configfile="config.x86_64" + fi + + cat "${srcdir}/${_configfile}" > ./.config + + # Set some -tkg defaults + echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config + sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' ./.config + echo "CONFIG_DEFAULT_CAKE=y" >> ./.config + echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config + echo "CONFIG_TP_SMAPI=m" >> ./.config + echo "CONFIG_RAID6_USE_PREFER_GEN=y" >> ./.config + echo "# CONFIG_NTP_PPS is not set" >> ./.config + sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config + sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config + sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config + sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config + sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config + + # Inject cpuopts options + echo "# CONFIG_MK8SSE3 is not set" >> ./.config + echo "# CONFIG_MK10 is not set" >> ./.config + echo "# CONFIG_MBARCELONA is not set" >> ./.config + echo "# CONFIG_MBOBCAT is not set" >> ./.config + echo "# CONFIG_MJAGUAR is not set" >> ./.config + echo "# CONFIG_MBULLDOZER is not set" >> ./.config + echo "# CONFIG_MPILEDRIVER is not set" >> ./.config + echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config + echo "# CONFIG_MEXCAVATOR is not set" >> ./.config + echo "# CONFIG_MZEN is not set" >> ./.config + echo "# CONFIG_MZEN2 is not set" >> ./.config + echo "# CONFIG_MATOM is not set" >> ./.config + echo "# CONFIG_MNEHALEM is not set" >> ./.config + echo "# CONFIG_MWESTMERE is not set" >> ./.config + echo "# CONFIG_MSILVERMONT is not set" >> ./.config + echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config + echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config + echo "# CONFIG_MHASWELL is not set" >> ./.config + echo "# CONFIG_MBROADWELL is not set" >> ./.config + echo "# CONFIG_MSKYLAKE is not set" >> ./.config + echo "# CONFIG_MSKYLAKEX is not set" >> ./.config + echo "# CONFIG_MCANNONLAKE is not set" >> ./.config + echo "# CONFIG_MICELAKE is not set" >> ./.config + + # Disable some debugging + if [ "${_debugdisable}" == "true" ]; then + sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config + sed -i -e 
's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config + fi + + if [ "${_cpusched}" == "MuQSS" ]; then + # MuQSS default config + echo "CONFIG_SCHED_MUQSS=y" >> ./.config + elif [ "${_cpusched}" == "pds" ]; then + # PDS default config + echo "CONFIG_SCHED_PDS=y" >> ./.config + elif [ "${_cpusched}" == "bmq" ]; then + # BMQ default config + echo "CONFIG_SCHED_BMQ=y" >> ./.config + fi + + if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then + # Disable CFS + sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config + sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config + sed -i -e 's/CONFIG_CGROUP_CPUACCT=y/# CONFIG_CGROUP_CPUACCT is not set/' ./.config + sed -i -e 's/CONFIG_SCHED_AUTOGROUP=y/# CONFIG_SCHED_AUTOGROUP is not set/' ./.config + # sched yield type + if [ -n "$_sched_yield_type" ]; then + CONDITION0="$_sched_yield_type" + else + plain "" + plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." + plain "" + plain "For PDS and MuQSS:" + plain "0: No yield." + plain "1: Yield only to better priority/deadline tasks." + plain "2: Expire timeslice and recalculate deadline." + plain "" + plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" + plain "0: No yield." + plain "1: Deboost and requeue task. (default)" + plain "2: Set rq skip task." + read -rp "`echo $'\n > 0. Recommended option for gaming on PDS and MuQSS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; + fi + if [ "$CONDITION0" == "1" ]; then + msg2 "Using default CPU sched yield type (1)" + elif [ "$CONDITION0" == "2" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c + else + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c + fi + fi + + # Round Robin interval + if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then + if [ -n "$_rr_interval" ]; then + CONDITION1="$_rr_interval" + else + plain "" + plain "Round Robin interval is the longest duration two tasks with the same nice level will" + plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" + plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" + plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
+ plain "" + plain "MuQSS default: 6ms" + plain "PDS default: 4ms" + plain "BMQ default: 2ms" + read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms (worth a shot with MuQSS + yield_type 2)\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; + fi + if [ "$CONDITION1" == "1" ]; then + msg2 "Using 2ms rr_interval" + _rrvalue="2" + elif [ "$CONDITION1" == "2" ]; then + msg2 "Using 4ms rr_interval" + _rrvalue="4" + elif [ "$CONDITION1" == "3" ]; then + msg2 "Using 6ms rr_interval" + _rrvalue="6" + elif [ "$CONDITION1" == "4" ]; then + msg2 "Using 8ms rr_interval" + _rrvalue="8" + else + msg2 "Using default rr_interval" + _rrvalue="default" + fi + if [ "$_rrvalue" != "default" ]; then + if [ "${_cpusched}" == "MuQSS" ]; then + sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "pds" ]; then + sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "bmq" ]; then + echo "CONFIG_SCHED_TIMESLICE=${_rrvalue}" >> ./.config + fi + else + if [ "${_cpusched}" == "bmq" ]; then + echo "CONFIG_SCHED_TIMESLICE=2" >> ./.config + fi + fi + fi + + # zenify + if [ "$_zenify" == "true" ]; then + echo "CONFIG_ZENIFY=y" >> ./.config + elif [ "$_zenify" == "false" ]; then + echo "# CONFIG_ZENIFY is not set" >> ./.config + fi + + # compiler optimization level + if [ "$_compileroptlevel" == "1" ]; then + echo "# CONFIG_CC_OPTIMIZE_HARDER is not set" >> ./.config + elif [ "$_compileroptlevel" == "2" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + echo "CONFIG_CC_OPTIMIZE_HARDER=y" >> ./.config + elif [ "$_compileroptlevel" == "3" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config + echo "# CONFIG_CC_OPTIMIZE_HARDER is not set" >> ./.config + fi + + # cpu opt + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then + echo "# CONFIG_MNATIVE is not set" >> ./.config + fi + + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then + sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config + fi + + if [ "$_processor_opt" == "native" ]; then + echo "CONFIG_MNATIVE=y" >> ./.config + elif [ "$_processor_opt" == "k8" ]; then + sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config + elif [ "$_processor_opt" == "k8sse3" ]; then + sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config + elif [ "$_processor_opt" == "k10" ]; then + sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config + elif [ "$_processor_opt" == "barcelona" ]; then + sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config + elif [ "$_processor_opt" == "bobcat" ]; then + sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config + elif [ "$_processor_opt" == "jaguar" ]; then + sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config + elif [ "$_processor_opt" == "bulldozer" ]; then + sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config + elif [ "$_processor_opt" == "piledriver" ]; then + sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config + elif [ "$_processor_opt" == "steamroller" ]; then + sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config + 
elif [ "$_processor_opt" == "excavator" ]; then + sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config + elif [ "$_processor_opt" == "zen" ]; then + sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config + elif [ "$_processor_opt" == "zen2" ]; then + sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config + elif [ "$_processor_opt" == "mpsc" ]; then + sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config + elif [ "$_processor_opt" == "atom" ]; then + sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config + elif [ "$_processor_opt" == "core2" ]; then + sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config + elif [ "$_processor_opt" == "nehalem" ]; then + sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config + elif [ "$_processor_opt" == "westmere" ]; then + sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config + elif [ "$_processor_opt" == "silvermont" ]; then + sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config + elif [ "$_processor_opt" == "sandybridge" ]; then + sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config + elif [ "$_processor_opt" == "ivybridge" ]; then + sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config + elif [ "$_processor_opt" == "haswell" ]; then + sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config + elif [ "$_processor_opt" == "broadwell" ]; then + sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config + elif [ "$_processor_opt" == "skylake" ]; then + sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config + elif [ "$_processor_opt" == "skylakex" ]; then + sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config + elif [ "$_processor_opt" == "cannonlake" ]; then + sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config + elif [ "$_processor_opt" == "icelake" ]; then + sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config + fi + + # irq threading + if [ "$_irq_threading" == "true" ]; then + echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config + elif [ "$_irq_threading" == "false" ]; then + echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config + fi + + # smt nice + if [ "$_smt_nice" == "true" ]; then + echo "CONFIG_SMT_NICE=y" >> ./.config + elif [ "$_smt_nice" == "false" ]; then + echo "# CONFIG_SMT_NICE is not set" >> ./.config + fi + + # random trust cpu + if [ "$_random_trust_cpu" == "true" ]; then + sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config + fi + + # rq sharing + if [ "$_runqueue_sharing" == "none" ]; then + echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "smt" ]; then + echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "mc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "smp" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not 
set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "all" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config + elif [ "$_runqueue_sharing" == "mc-llc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + fi + + # timer freq + if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + if [ "$_timer_freq" == "1000" ]; then + sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "CONFIG_HZ_1000_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "750" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "CONFIG_HZ_750=y" >> ./.config + echo "CONFIG_HZ_750_NODEF=y" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "500" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + fi + else + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + fi + + # default cpu gov + if [ "$_default_cpu_gov" == "performance" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config + elif [ "$_default_cpu_gov" == "ondemand" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config + fi + + # ACPI_CPUFREQ disablement + if [ "$_disable_acpi_cpufreq" == "true" ]; then + sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config + fi + + # ftrace + if [ -z "$_ftracedisable" ]; then + plain "" + plain "Disable FUNCTION_TRACER/GRAPH_TRACER? 
Lowers overhead but limits debugging" + plain "and analyzing of kernel functions." + read -rp "`echo $' > N/y : '`" CONDITION2; + fi + if [ "$CONDITION2" == "y" ] || [ "$_ftracedisable" == "true" ]; then + sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config + sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config + fi + + # disable numa + if [ -z "$_numadisable" ]; then + plain "" + plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." + plain "https://bbs.archlinux.org/viewtopic.php?id=239174" + read -rp "`echo $' > N/y : '`" CONDITION3; + fi + if [ "$CONDITION3" == "y" ] || [ "$_numadisable" == "true" ]; then + # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU + sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ + -i -e '/CONFIG_AMD_NUMA=y/d' \ + -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ + -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ + -i -e '/# CONFIG_NUMA_EMU is not set/d' \ + -i -e '/CONFIG_NODES_SHIFT=6/d' \ + -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ + -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ + -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config + fi + + # tickless + if [ -z "$_tickless" ]; then + plain "" + plain "Use CattaRappa mode (Tickless/Dynticks) ?" + plain "Can give higher performances in many cases but lower consistency on some hardware." + plain "Just tickless idle can perform better on some platforms (mostly AMD based)." + read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + fi + if [ "$CONDITION4" == "0" ] || [ "$_tickless" == "0" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config + elif [ "$CONDITION4" == "2" ] || [ "$_tickless" == "2" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config + echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config + fi + + # voluntary preempt + if [ -z "$_voluntary_preempt" ]; then + plain "" + plain "Use explicit preemption points?" 
+ plain "It can improve latency on PDS (at the cost of throughput)" + plain "and improve throughput on other schedulers (at the cost of latency)" + read -rp "`echo $' > N/y : '`" CONDITION5; + fi + if [ "$CONDITION5" == "y" ] || [ "$_voluntary_preempt" == "true" ]; then + sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config + sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config + sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config + fi + + # Open Firmware support + if [ -z "$_OFenable" ]; then + plain "" + plain "Enable Device Tree and Open Firmware support?" + read -rp "`echo $' > N/y : '`" CONDITION6; + fi + if [ "$CONDITION6" == "y" ] || [ "$_OFenable" == "true" ]; then + sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config + fi + + # acs override + if [ -z "$_acs_override" ]; then + plain "" + plain "Use ACS override patch?" + plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" + read -rp "`echo $' > N/y : '`" CONDITION7; + fi + if [ "$CONDITION7" == "y" ] || [ "$_acs_override" == "true" ]; then + patch -Np1 -i ../0006-add-acs-overrides_iommu.patch + fi + + # bcachefs +# if [ -z "$_bcachefs" ]; then +# plain "" +# plain "Add Bcache filesystem support? You'll have to install bcachefs-tools-git from AUR for utilities." +# plain "https://bcachefs.org/" +# read -rp "`echo $' > N/y : '`" CONDITION8; +# fi +# if [ "$CONDITION8" == "y" ] || [ "$_bcachefs" == "true" ]; then +# patch -Np1 -i ../0008-5.4-bcachefs.patch +# echo "CONFIG_BCACHEFS_FS=m" >> ./.config +# echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config +# echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config +# echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config +# echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config +# echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config +# fi + + # fsync support + if [ -z "$_fsync" ]; then + plain "" + plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" + plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" + read -rp "`echo $' > N/y : '`" CONDITION9; + fi + if [ "$CONDITION9" == "y" ] || [ "$_fsync" == "true" ]; then + patch -Np1 -i ../0007-v5.4-fsync.patch + fi + + # ZFS fix + if [ -z "$_zfsfix" ]; then + plain "" + plain "Add back missing symbol for AES-NI/AVX support on ZFS" + plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" + read -rp "`echo $' > N/y : '`" CONDITION11; + fi + if [ "$CONDITION11" == "y" ] || [ "$_zfsfix" == "true" ]; then + patch -Np1 -i ../0011-ZFS-fix.patch + fi + + # Community patches + if [ -n "$_community_patches" ]; then + _community_patches=($_community_patches) + for _p in ${_community_patches[@]}; do + ln -s "$_where"/../community-patches/linux54-tkg/$_p "$_where"/ + done + fi + + # userpatches + if [ "$_user_patches" == "true" ]; then + _userpatch_target="linux-${_basekernel}" + _userpatch_ext="my" + user_patcher + fi + + # Community patches removal + for _p in ${_community_patches[@]}; do + rm -f "$_where"/$_p + done + + # don't run depmod on 'make install'. We'll do this ourselves in packaging + sed -i '2iexit 0' scripts/depmod.sh + + # get kernel version + make prepare + + # modprobed-db + if [ -z "$_modprobeddb" ]; then + plain "" + plain "Use modprobed db to clean config from unneeded modules?" + plain "Speeds up compilation considerably. Requires root." 
+ plain "https://wiki.archlinux.org/index.php/Modprobed-db" + plain "!!!! Make sure to have a well populated db !!!!" + read -rp "`echo $' > N/y : '`" CONDITIONMPDB; + fi + if [ "$CONDITIONMPDB" == "y" ] || [ "$_modprobeddb" == "true" ]; then + sudo modprobed-db recall + make localmodconfig + fi + + if [ true = "$_config_fragments" ]; then + local fragments=() + mapfile -d '' -t fragments < <(find "$_where" -type f -name "*.myfrag" -print0) + + if [ true = "$_config_fragments_no_confirm" ]; then + printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" + else + for i in "${!fragments[@]}"; do + while true; do + read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? [y/N] ' CONDITIONMPDB + CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONMPDB" in + y|yes) + break;; + n|no|'') + unset fragments[$i] + break;; + *) + echo 'Please answer with yes or no' + esac + done + done + fi + + if [ 0 -lt "${#fragments[@]}" ]; then + scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" + fi + fi + + # menuconfig / nconfig + if [ -z "$_menunconfig" ]; then + plain "" + plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" + plain "to configure the kernel before building it?" + plain "If you do, make sure your terminal is currently" + plain "at least 19 lines by 80 columns large or you'll get an error :D" + read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n choice[0-2?]: '`" CONDITIONMNC; + _menunconfig="$CONDITIONMNC" + fi + if [ 1 = "$_menunconfig" ]; then + cp .config .config.orig + make menuconfig + elif [ 2 = "$_menunconfig" ]; then + cp .config .config.orig + make nconfig + else + # rewrite configuration + yes "" | make config >/dev/null + fi + if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ]; then + if [ -z "${_diffconfig}" ]; then + while true; do + read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF + CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONF" in + y|yes) + _diffconfig=true + break;; + n|no|'') + _diffconfig=false + break;; + *) + echo 'Please answer with yes or no' + esac + done + fi + if [ true = "$_diffconfig" ]; then + if [ -z "$_diffconfig_name" ]; then + IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name + fi + if [ -z "$_diffconfig_name" ]; then + echo 'No file name given, not generating config fragment.' + else ( + prev_pwd="${PWD:-$(pwd)}" + cd "$_where" + "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" + ) fi + fi + rm .config.orig + fi + + make -s kernelrelease > version + msg2 "Prepared %s version %s" "$pkgbase" "$(&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make $_force_all_threads LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) +} + +hackbase() { + pkgdesc="The $pkgdesc kernel and modules" + depends=('coreutils' 'kmod' 'initramfs') + optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' + 'crda: to set the correct wireless channels of your country.' + 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' + 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' + 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' 
+ 'update-grub: Simple wrapper around grub-mkconfig.') + provides=("linux=${pkgver}" "${pkgbase}") + + cd "${srcdir}/linux-${_basekernel}" + + # get kernel version + local _kernver="$( +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. +@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. 
+ ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. ++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch b/linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch new file mode 100644 index 0000000..a7c9d4a --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch @@ -0,0 +1,354 @@ +From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c25acace7d91..0ddebdad9f5b 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -61,7 +61,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +2.20.1 + +From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 8b5d85c91e9d..5e2d813a048d 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ 
.target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +2.20.1 + +From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index cf3c5095c10e..b30d51837b2d 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3897,8 +3897,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.20.1 + +From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index eef04551eae7..1ec5ab4c8ff7 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -720,6 +720,7 
@@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] drivers: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index aaef17cc6512..d08f3a394929 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -58,15 +58,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch b/linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch new file mode 100644 index 0000000..36cfd14 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch @@ -0,0 +1,5199 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index 87f1fc9..b3be470 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -50,8 +50,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" + +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 6adce15268bd..f8612b07d32f 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -116,6 +116,7 @@ config MPENTIUMM + config MPENTIUM4 + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" + depends on X86_32 ++ select X86_P6_NOP + ---help--- + Select this for Intel Pentium 4 chips. This includes the + Pentium 4, Pentium D, P4-based Celeron and Xeon, and +@@ -148,9 +149,8 @@ config MPENTIUM4 + -Paxville + -Dempsey + +- + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + ---help--- + Select this for an AMD K6-family processor. 
Enables use of +@@ -158,7 +158,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + ---help--- + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -166,12 +166,83 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + ---help--- + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ ---help--- ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ ---help--- ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ ---help--- ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ ---help--- ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ ---help--- ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ ---help--- ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ ---help--- ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ ---help--- ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ ---help--- ++ Select this for AMD Family 15h Excavator processors. ++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ ---help--- ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -253,6 +324,7 @@ config MVIAC7 + + config MPSC + bool "Intel P4 / older Netburst based Xeon" ++ select X86_P6_NOP + depends on X86_64 + ---help--- + Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey +@@ -262,8 +334,19 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" ++ select X86_P6_NOP + ---help--- + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -271,14 +354,106 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" ++ select X86_P6_NOP + ---help--- + +- Select this for the Intel Atom platform. 
Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 8th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -287,6 +462,19 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE ++ bool "Native optimizations autodetected by GCC" ++ ---help--- ++ ++ GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. -march=native ++ also detects and applies additional settings beyond -march specific ++ to your CPU, (eg. -msse4). Unless you have a specific reason not to ++ (e.g. distcc cross-compiling), you should probably be using ++ -march=native rather than anything listed below. 
++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -311,7 +499,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "4" if MELAN || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -329,35 +517,36 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MNATIVE + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MATOM || MNATIVE + + config X86_USE_3DNOW + def_bool y + depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML + +-# +-# P6_NOPs are a relatively minor optimization that require a family >= +-# 6 processor, except that it is broken on certain VIA chips. +-# Furthermore, AMD chips prefer a totally different sequence of NOPs +-# (which work on all CPUs). In addition, it looks like Virtual PC +-# does not understand them. +-# +-# As a result, disallow these if we're not compiling for X86_64 (these +-# NOPs do work on all x86-64 capable chips); the list of processors in +-# the right-hand clause are the cores that benefit from this optimization. +-# + config X86_P6_NOP +- def_bool y +- depends on X86_64 +- depends on (MCORE2 || MPENTIUM4 || MPSC) ++ default n ++ bool "Support for P6_NOPs on Intel chips" ++ depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MNATIVE) ++ ---help--- ++ P6_NOPs are a relatively minor optimization that require a family >= ++ 6 processor, except that it is broken on certain VIA chips. ++ Furthermore, AMD chips prefer a totally different sequence of NOPs ++ (which work on all CPUs). In addition, it looks like Virtual PC ++ does not understand them. 
++ ++ As a result, disallow these if we're not compiling for X86_64 (these ++ NOPs do work on all x86-64 capable chips); the list of processors in ++ the right-hand clause are the cores that benefit from this optimization. ++ ++ Say Y if you have Intel CPU newer than Pentium Pro, N otherwise. + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MNATIVE || MATOM) || X86_64 + + config X86_CMPXCHG64 + def_bool y +@@ -367,7 +556,7 @@ config X86_CMPXCHG64 + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 85a66c4a8b65..1cdf77b9800e 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -118,13 +118,46 @@ else + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) ++ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) ++ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) ++ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) ++ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) ++ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) ++ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) ++ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) ++ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) ++ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) ++ cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4) ++ cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ +- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) +- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) ++ cflags-$(CONFIG_MNEHALEM) += \ ++ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) ++ cflags-$(CONFIG_MWESTMERE) += \ ++ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) ++ cflags-$(CONFIG_MSILVERMONT) += \ ++ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) ++ 
cflags-$(CONFIG_MSANDYBRIDGE) += \ ++ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) ++ cflags-$(CONFIG_MIVYBRIDGE) += \ ++ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) ++ cflags-$(CONFIG_MHASWELL) += \ ++ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) ++ cflags-$(CONFIG_MBROADWELL) += \ ++ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) ++ cflags-$(CONFIG_MSKYLAKE) += \ ++ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) ++ cflags-$(CONFIG_MSKYLAKEX) += \ ++ $(call cc-option,-march=skylake-avx512,$(call cc-option,-mtune=skylake-avx512)) ++ cflags-$(CONFIG_MCANNONLAKE) += \ ++ $(call cc-option,-march=cannonlake,$(call cc-option,-mtune=cannonlake)) ++ cflags-$(CONFIG_MICELAKE) += \ ++ $(call cc-option,-march=icelake,$(call cc-option,-mtune=icelake)) ++ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu +index 1f5faf8606b4..14a6d19995cc 100644 +--- a/arch/x86/Makefile_32.cpu ++++ b/arch/x86/Makefile_32.cpu +@@ -23,7 +23,18 @@ cflags-$(CONFIG_MK6) += -march=k6 + # Please note, that patches that add -march=athlon-xp and friends are pointless. + # They make zero difference whatsosever to performance at this time. + cflags-$(CONFIG_MK7) += -march=athlon ++cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) ++cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) ++cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) ++cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) ++cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) ++cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) ++cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) ++cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) ++cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) ++cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon) ++cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon) + cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +@@ -32,8 +43,19 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-fu + cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) + cflags-$(CONFIG_MVIAC7) += -march=i686 + cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) ++cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) ++cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) ++cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) ++cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) ++cflags-$(CONFIG_MHASWELL) += -march=i686 
$(call tune,haswell) ++cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) ++cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) ++cflags-$(CONFIG_MSKYLAKEX) += -march=i686 $(call tune,skylake-avx512) ++cflags-$(CONFIG_MCANNONLAKE) += -march=i686 $(call tune,cannonlake) ++cflags-$(CONFIG_MICELAKE) += -march=i686 $(call tune,icelake) ++cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + + # AMD Elan support + cflags-$(CONFIG_MELAN) += -march=i486 +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index 7948a17febb4..44b776297dc3 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ -25,6 +25,30 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE ++#define MODULE_PROC_FAMILY "NATIVE " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT ++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MSANDYBRIDGE ++#define MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -43,6 +67,26 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE + +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index f8612b07d32ff98381b61cbb21a116af6317c5e9..40f651bb363a0c368bc28c3ff0112d60c757003f 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -243,6 +243,13 @@ config MZEN + + Enables -march=znver1 + ++config MZEN2 ++ bool "AMD Zen 2" ++ ---help--- ++ Select this for AMD Family 17h Zen 2 processors. 
++ ++ Enables -march=znver2 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -499,7 +506,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "4" if MELAN || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -556,7 +563,7 @@ config X86_CMPXCHG64 + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) ++ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 29eb64851528d79f0838cdeb73789d940090cad2..7204717ac8e210f1150073f80ccfe51ce83443fe 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -130,6 +130,7 @@ else + cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) + cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4) + cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1) ++ cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ +diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu +index 14a6d19995cc89bec7a3c513c4e9f65678b37228..a4b8484d61da9286d10e1c07272b585268f93067 100644 +--- a/arch/x86/Makefile_32.cpu ++++ b/arch/x86/Makefile_32.cpu +@@ -35,6 +35,7 @@ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) + cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) + cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon) + cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon) ++cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2,-march=athlon) + cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 
-falign-loops=0 + cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index 44b776297dc3264a2f1241d13bb0f37021f97f13..630aa88f7de38fac27cf164d655c9202bde467ae 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ -87,6 +87,8 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "EXCAVATOR " + #elif defined CONFIG_MZEN + #define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE + +diff --git a/fs/dcache.c b/fs/dcache.c +index 2acfc69878f5..3f1131431e06 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -69,7 +69,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 211890edf37e..37121563407d 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we average the RT time consumption, measured +@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. +- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. +diff --git a/lib/Kconfig b/lib/Kconfig +index 5fe577673b98..c44c27cd6e05 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -10,6 +10,16 @@ menu "Library routines" + config RAID6_PQ + tristate + ++config RAID6_USE_PREFER_GEN ++ bool "Use prefered raid6 gen function." ++ default n ++ depends on RAID6_PQ ++ help ++ This option is provided for using prefered raid6 gen function ++ directly instead of calculating the best durning boot-up. ++ The prefered function should be the same as the best one from ++ calculating. ++ + config BITREVERSE + tristate + +diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c +index 5065b1e7e327..1bf3c712a4ca 100644 +--- a/lib/raid6/algos.c ++++ b/lib/raid6/algos.c +@@ -150,6 +150,29 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) + return best; + } + ++#ifdef CONFIG_RAID6_USE_PREFER_GEN ++static inline const struct raid6_calls *raid6_choose_prefer_gen(void) ++{ ++ const struct raid6_calls *const *algo; ++ const struct raid6_calls *best; ++ ++ for (best = NULL, algo = raid6_algos; *algo; algo++) { ++ if (!best || (*algo)->prefer >= best->prefer) { ++ if ((*algo)->valid && !(*algo)->valid()) ++ continue; ++ best = *algo; ++ } ++ } ++ ++ if (best) { ++ printk("raid6: using algorithm %s\n", best->name); ++ raid6_call = *best; ++ } else ++ printk("raid6: Yikes! 
No algorithm found!\n"); ++ ++ return best; ++} ++#else + static inline const struct raid6_calls *raid6_choose_gen( + void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) + { +@@ -221,6 +244,7 @@ static inline const struct raid6_calls *raid6_choose_gen( + + return best; + } ++#endif + + + /* Try to pick the best algorithm */ +@@ -228,10 +252,11 @@ static inline const struct raid6_calls *raid6_choose_gen( + + int __init raid6_select_algo(void) + { +- const int disks = (65536/PAGE_SIZE)+2; +- + const struct raid6_calls *gen_best; + const struct raid6_recov_calls *rec_best; ++#ifndef CONFIG_RAID6_USE_PREFER_GEN ++ const int disks = (65536/PAGE_SIZE)+2; ++ + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i; +@@ -252,11 +277,16 @@ int __init raid6_select_algo(void) + + /* select raid gen_syndrome function */ + gen_best = raid6_choose_gen(&dptrs, disks); ++#else ++ gen_best = raid6_choose_prefer_gen(); ++#endif + + /* select raid recover functions */ + rec_best = raid6_choose_recov(); + ++#ifndef CONFIG_RAID6_USE_PREFER_GEN + free_pages((unsigned long)syndromes, 1); ++#endif + + return gen_best && rec_best ? 0 : -EINVAL; + } +diff --git a/mm/zswap.c b/mm/zswap.c +index 61a5c41972db..2674c2806130 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -91,7 +91,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { + module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); + + /* Crypto compressor to use */ +-#define ZSWAP_COMPRESSOR_DEFAULT "lzo" ++#define ZSWAP_COMPRESSOR_DEFAULT "lz4" + static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + static int zswap_compressor_param_set(const char *, + const struct kernel_param *); +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 71f39410691b..288f9679e883 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: Zenify & stuff + + +diff --git a/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt +new file mode 100644 +index 000000000000..a249678a8866 +--- /dev/null ++++ b/Documentation/tp_smapi.txt +@@ -0,0 +1,275 @@ ++tp_smapi version 0.42 ++IBM ThinkPad hardware functions driver ++ ++Author: Shem Multinymous ++Project: http://sourceforge.net/projects/tpctl ++Wiki: http://thinkwiki.org/wiki/tp_smapi ++List: linux-thinkpad@linux-thinkpad.org ++ (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) ++ ++Description ++----------- ++ ++ThinkPad laptops include a proprietary interface called SMAPI BIOS ++(System Management Application Program Interface) which provides some ++hardware control functionality that is not accessible by other means. ++ ++This driver exposes some features of the SMAPI BIOS through a sysfs ++interface. It is suitable for newer models, on which SMAPI is invoked ++through IO port writes. Older models use a different SMAPI interface; ++for those, try the "thinkpad" module from the "tpctl" package. ++ ++WARNING: ++This driver uses undocumented features and direct hardware access. ++It thus cannot be guaranteed to work, and may cause arbitrary damage ++(especially on models it wasn't tested on). 
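A quick way to confirm the driver loaded and exposed its interface is to list the platform device directory (a minimal usage sketch; the module and attribute names are the ones documented in the sections below, and attribute availability varies by model):

# modprobe thinkpad_ec
# modprobe tp_smapi
# ls /sys/devices/platform/smapi/BAT0/
# cat /sys/devices/platform/smapi/BAT0/installed    # 0 or 1 (battery present)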
++ ++ ++Module parameters ++----------------- ++ ++thinkpad_ec module: ++ force_io=1 lets thinkpad_ec load on some recent ThinkPad models ++ (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. ++tp_smapi module: ++ debug=1 enables verbose dmesg output. ++ ++ ++Usage ++----- ++ ++Control of battery charging thresholds (in percents of current full charge ++capacity): ++ ++# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh ++# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh ++# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh ++ ++ (This is useful since Li-Ion batteries wear out much faster at very ++ high or low charge levels. The driver will also keeps the thresholds ++ across suspend-to-disk with AC disconnected; this isn't done ++ automatically by the hardware.) ++ ++Inhibiting battery charging for 17 minutes (overrides thresholds): ++ ++# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++# echo 0 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes # stop ++# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++ ++ (This can be used to control which battery is charged when using an ++ Ultrabay battery.) ++ ++Forcing battery discharging even if AC power available: ++ ++# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge # start discharge ++# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge # stop discharge ++# cat /sys/devices/platform/smapi/BAT0/force_discharge ++ ++ (When AC is connected, forced discharging will automatically stop ++ when battery is fully depleted -- this is useful for calibration. ++ Also, this attribute can be used to control which battery is discharged ++ when both a system battery and an Ultrabay battery are connected.) ++ ++Misc read-only battery status attributes (see note about HDAPS below): ++ ++/sys/devices/platform/smapi/BAT0/installed # 0 or 1 ++/sys/devices/platform/smapi/BAT0/state # idle/charging/discharging ++/sys/devices/platform/smapi/BAT0/cycle_count # integer counter ++/sys/devices/platform/smapi/BAT0/current_now # instantaneous current ++/sys/devices/platform/smapi/BAT0/current_avg # last minute average ++/sys/devices/platform/smapi/BAT0/power_now # instantaneous power ++/sys/devices/platform/smapi/BAT0/power_avg # last minute average ++/sys/devices/platform/smapi/BAT0/last_full_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/remaining_percent # remaining percent of energy (set by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_percent_error # error range of remaing_percent (not reset by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_running_time # in minutes, by last minute average power ++/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantenous power ++/sys/devices/platform/smapi/BAT0/remaining_charging_time # in minutes ++/sys/devices/platform/smapi/BAT0/remaining_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/design_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/voltage # in mV ++/sys/devices/platform/smapi/BAT0/design_voltage # in mV ++/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current ++/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage ++/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below ++/sys/devices/platform/smapi/BAT0/manufacturer # string ++/sys/devices/platform/smapi/BAT0/model # string ++/sys/devices/platform/smapi/BAT0/barcoding # string ++/sys/devices/platform/smapi/BAT0/chemistry # string 
++/sys/devices/platform/smapi/BAT0/serial # integer ++/sys/devices/platform/smapi/BAT0/manufacture_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/first_use_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/temperature # in milli-Celsius ++/sys/devices/platform/smapi/BAT0/dump # see below ++/sys/devices/platform/smapi/ac_connected # 0 or 1 ++ ++The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups ++in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models, ++the battery contains 3 cell groups in series, where each group consisting of 2 ++or 3 cells connected in parallel. The voltage of each group is given by these ++attributes, and their sum (roughly) equals the "voltage" attribute. ++(The effective performance of the battery is determined by the weakest group, ++i.e., the one those voltage changes most rapidly during dis/charging.) ++ ++The "BAT0/dump" attribute gives a a hex dump of the raw status data, which ++contains additional data now in the above (if you can figure it out). Some ++unused values are autodetected and replaced by "--": ++ ++In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g. ++in the UltraBay). ++ ++ ++Raw SMAPI calls: ++ ++/sys/devices/platform/smapi/smapi_request ++This performs raw SMAPI calls. It uses a bad interface that cannot handle ++multiple simultaneous access. Don't touch it, it's for development only. ++If you did touch it, you would so something like ++# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request ++# cat /sys/devices/platform/smapi/smapi_request ++and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd ++value, converted to decimal is 75: the current charge stop threshold. ++ ++ ++Model-specific status ++--------------------- ++ ++Works (at least partially) on the following ThinkPad model: ++* A30 ++* G41 ++* R40, R50p, R51, R52 ++* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60, T61, T400, T410, T420 (partially) ++* X24, X31, X32, X40, X41, X60, X61, X200, X201, X220 (partially) ++* Z60t, Z61m ++ ++Does not work on: ++* X230 and newer ++* T430 and newer ++* Any ThinkPad Edge ++* Any ThinkPad Yoga ++* Any ThinkPad L series ++* Any ThinkPad P series ++ ++Not all functions are available on all models; for detailed status, see: ++ http://thinkwiki.org/wiki/tp_smapi ++ ++Please report success/failure by e-mail or on the Wiki. ++If you get a "not implemented" or "not supported" message, your laptop ++probably just can't do that (at least not via the SMAPI BIOS). ++For negative reports, follow the bug reporting guidelines below. ++If you send me the necessary technical data (i.e., SMAPI function ++interfaces), I will support additional models. ++ ++ ++Additional HDAPS features ++------------------------- ++ ++The modified hdaps driver has several improvements on the one in mainline ++(beyond resolving the conflict with thinkpad_ec and tp_smapi): ++ ++- Fixes reliability and improves support for recent ThinkPad models ++ (especially *60 and newer). Unlike the mainline driver, the modified hdaps ++ correctly follows the Embedded Controller communication protocol. ++ ++- Extends the "invert" parameter to cover all possible axis orientations. ++ The possible values are as follows. ++ Let X,Y denote the hardware readouts. ++ Let R denote the laptop's roll (tilt left/right). ++ Let P denote the laptop's pitch (tilt forward/backward). 
++ invert=0: R= X P= Y (same as mainline) ++ invert=1: R=-X P=-Y (same as mainline) ++ invert=2: R=-X P= Y (new) ++ invert=3: R= X P=-Y (new) ++ invert=4: R= Y P= X (new) ++ invert=5: R=-Y P=-X (new) ++ invert=6: R=-Y P= X (new) ++ invert=7: R= Y P=-X (new) ++ It's probably easiest to just try all 8 possibilities and see which yields ++ correct results (e.g., in the hdaps-gl visualisation). ++ ++- Adds a whitelist which automatically sets the correct axis orientation for ++ some models. If the value for your model is wrong or missing, you can override ++ it using the "invert" parameter. Please also update the tables at ++ http://www.thinkwiki.org/wiki/tp_smapi and ++ http://www.thinkwiki.org/wiki/List_of_DMI_IDs ++ and submit a patch for the whitelist in hdaps.c. ++ ++- Provides new attributes: ++ /sys/devices/platform/hdaps/sampling_rate: ++ This determines the frequency at which the host queries the embedded ++ controller for accelerometer data (and informs the hdaps input devices). ++ Default=50. ++ /sys/devices/platform/hdaps/oversampling_ratio: ++ When set to X, the embedded controller is told to do physical accelerometer ++ measurements at a rate that is X times higher than the rate at which ++ the driver reads those measurements (i.e., X*sampling_rate). This ++ makes the readouts from the embedded controller more fresh, and is also ++ useful for the running average filter (see next). Default=5 ++ /sys/devices/platform/hdaps/running_avg_filter_order: ++ When set to X, reported readouts will be the average of the last X physical ++ accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a ++ high value decreases readout fluctuations. The averaging is handled by the ++ embedded controller, so no CPU resources are used. Higher values make the ++ readouts smoother, since it averages out both sensor noise (good) and abrupt ++ changes (bad). Default=2. ++ ++- Provides a second input device, which publishes the raw accelerometer ++ measurements (without the fuzzing needed for joystick emulation). This input ++ device can be matched by a udev rule such as the following (all on one line): ++ KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1", ++ ATTRS{modalias}=="input:b0019v1014p5054e4801-*", ++ SYMLINK+="input/hdaps/accelerometer-event ++ ++A new version of the hdapsd userspace daemon, which uses the input device ++interface instead of polling sysfs, is available seprately. Using this reduces ++the total interrupts per second generated by hdaps+hdapsd (on tickless kernels) ++to 50, down from a value that fluctuates between 50 and 100. Set the ++sampling_rate sysfs attribute to a lower value to further reduce interrupts, ++at the expense of response latency. ++ ++Licensing note: all my changes to the HDAPS driver are licensed under the ++GPL version 2 or, at your option and to the extent allowed by derivation from ++prior works, any later version. My version of hdaps is derived work from the ++mainline version, which at the time of writing is available only under ++GPL version 2. ++ ++Bug reporting ++------------- ++ ++Mail . Please include: ++* Details about your model, ++* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with ++ the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1"). ++* Output of "dmidecode | grep -C5 Product" ++* Does the failed functionality works under Windows? 
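When putting together such a report, the requested data can be gathered in a few commands (a minimal sketch using only the module parameters and commands mentioned above; adjust to your setup, e.g. if hdaps is also loaded):

# modprobe -r tp_smapi thinkpad_ec        # reload with debug output enabled
# modprobe thinkpad_ec debug=1
# modprobe tp_smapi debug=1
# dmesg | tail -n 50                      # relevant driver messages
# dmidecode | grep -C5 Product            # model identification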
++ ++ ++More about SMAPI ++---------------- ++ ++For hints about what may be possible via the SMAPI BIOS and how, see: ++ ++* IBM Technical Reference Manual for the ThinkPad 770 ++ (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) ++* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x"). ++* drivers/char/mwave/smapi.c in the Linux kernel tree.* ++* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net). ++* The SMAPI_* constants in tp_smapi.c. ++ ++Note that in the above Technical Reference and in the "thinkpad" module, ++SMAPI is invoked through a function call to some physical address. However, ++the interface used by tp_smapi and the above mwave drive, and apparently ++required by newer ThinkPad, is different: you set the parameters up in the ++CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this ++triggers an SMI (System Management Interrupt), causing the CPU to enter ++SMM (System Management Mode) and run the BIOS firmware; the results are ++returned in the CPU's registers. It is not clear what is the relation between ++the two variants of SMAPI, though the assignment of error codes seems to be ++similar. ++ ++In addition, the embedded controller on ThinkPad laptops has a non-standard ++interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip). ++The interface provides various system management services (currently known: ++battery information and accelerometer readouts). For more information see the ++thinkpad_ec module and the H8S hardware documentation: ++http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf +diff --git a/Makefile b/Makefile +index 863f58503bee..f33cf760af6d 100644 +--- a/Makefile ++++ b/Makefile +@@ -682,12 +682,16 @@ ifdef CONFIG_FUNCTION_TRACER + KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) + KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) + ++ifdef CONFIG_CC_OPTIMIZE_HARDER ++KBUILD_CFLAGS += -O3 $(call cc-disable-warning,maybe-uninitialized,) ++else + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 + KBUILD_CFLAGS += -O3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + endif ++endif + + ifdef CONFIG_CC_DISABLE_WARN_MAYBE_UNINITIALIZED + KBUILD_CFLAGS += -Wno-maybe-uninitialized + +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 4f32c4062fb6..c0bf039e1b40 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c +index 55d33500d55e..744e84228a1f 100644 +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -1338,7 +1338,9 @@ static int set_input_params(struct psmouse *psmouse, + if (psmouse_matches_pnp_id(psmouse, topbuttonpad_pnp_ids) && + !SYN_CAP_EXT_BUTTONS_STICK(info->ext_cap_10)) + __set_bit(INPUT_PROP_TOPBUTTONPAD, dev->propbit); +- } ++ } else if (SYN_CAP_CLICKPAD2BTN(info->ext_cap_0c) || ++ SYN_CAP_CLICKPAD2BTN2(info->ext_cap_0c)) ++ __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); + + return 0; + } +diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h +index 
fc00e005c611..4cfbeec3ae4c 100644 +--- a/drivers/input/mouse/synaptics.h ++++ b/drivers/input/mouse/synaptics.h +@@ -86,6 +86,7 @@ + */ + #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & BIT(20)) /* 1-button ClickPad */ + #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & BIT(8)) /* 2-button ClickPad */ ++#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & BIT(21)) /* 2-button ClickPad */ + #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & BIT(17)) + #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & BIT(13)) + #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & BIT(19)) +diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig +index 97a420c11eed..c8621e9b2e4a 100644 +--- a/drivers/macintosh/Kconfig ++++ b/drivers/macintosh/Kconfig +@@ -159,6 +159,13 @@ config INPUT_ADBHID + + If unsure, say Y. + ++config ADB_TRACKPAD_ABSOLUTE ++ bool "Enable absolute mode for adb trackpads" ++ depends on INPUT_ADBHID ++ help ++ Enable absolute mode in adb-base trackpads. This feature adds ++ compatibility with synaptics Xorg / Xfree drivers. ++ + config MAC_EMUMOUSEBTN + tristate "Support for mouse button 2+3 emulation" + depends on SYSCTL && INPUT +diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c +index a261892c03b3..a85192de840c 100644 +--- a/drivers/macintosh/adbhid.c ++++ b/drivers/macintosh/adbhid.c +@@ -262,6 +262,15 @@ static struct adb_ids buttons_ids; + #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ + #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ + ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++#define ABS_XMIN 310 ++#define ABS_XMAX 1700 ++#define ABS_YMIN 200 ++#define ABS_YMAX 1000 ++#define ABS_ZMIN 0 ++#define ABS_ZMAX 55 ++#endif ++ + static void + adbhid_keyboard_input(unsigned char *data, int nb, int apoll) + { +@@ -405,6 +414,9 @@ static void + adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + { + int id = (data[0] >> 4) & 0x0f; ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ int btn = 0; int x_axis = 0; int y_axis = 0; int z_axis = 0; ++#endif + + if (!adbhid[id]) { + pr_err("ADB HID on ID %d not yet registered\n", id); +@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + high bits of y-axis motion. XY is additional + high bits of x-axis motion. + ++ For ADB Absolute motion protocol the data array will contain the ++ following values: ++ ++ BITS COMMENTS ++ data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. ++ data[1] = byyy yyyy Left button and y-axis motion. ++ data[2] = bxxx xxxx Second button and x-axis motion. ++ data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. ++ data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion. ++ data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. ++ + MacAlly 2-button mouse protocol. 
+ + For MacAlly 2-button mouse protocol the data array will contain the +@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + switch (adbhid[id]->mouse_kind) + { + case ADBMOUSE_TRACKPAD: ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | ++ ((data[4] & 0x07) << 10); ++ y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | ++ ((data[4] & 0x70) << 6); ++ z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); ++ btn = (!(data[1] >> 7)) & 1; ++#else + data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); + data[2] = data[2] | 0x80; ++#endif + break; + case ADBMOUSE_MICROSPEED: + data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); +@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + break; + } + +- input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); +- input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ if ( adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD ) { + +- if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) +- input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); ++ if(z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1); ++ if(z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0); + +- input_report_rel(adbhid[id]->input, REL_X, +- ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); +- input_report_rel(adbhid[id]->input, REL_Y, +- ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 )); ++ if(z_axis > 0){ ++ input_report_abs(adbhid[id]->input, ABS_X, x_axis); ++ input_report_abs(adbhid[id]->input, ABS_Y, y_axis); ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5); ++ } else { ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0); ++ } ++ ++ input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); ++ input_report_key(adbhid[id]->input, BTN_LEFT, btn); ++ } else { ++#endif ++ input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); ++ input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++ ++ if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) ++ input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + ++ input_report_rel(adbhid[id]->input, REL_X, ++ ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); ++ input_report_rel(adbhid[id]->input, REL_Y, ++ ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ } ++#endif + input_sync(adbhid[id]->input); + } + +@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id, + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ set_bit(EV_ABS, input_dev->evbit); ++ input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); ++ set_bit(BTN_TOUCH, input_dev->keybit); ++ set_bit(BTN_TOOL_FINGER, input_dev->keybit); ++ set_bit(ABS_TOOL_WIDTH, input_dev->absbit); ++#endif + break; + + case ADB_MISC: +@@ -1132,7 +1195,11 @@ init_trackpad(int id) + r1_buffer[3], + r1_buffer[4], + r1_buffer[5], ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ 0x00, /* Enable absolute mode */ ++#else + 0x03, /*r1_buffer[6],*/ ++#endif + r1_buffer[7]); + + /* Without this flush, the trackpad may be locked up */ +diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig +index ac4d48830415..b272132ac742 100644 +--- a/drivers/platform/x86/Kconfig ++++ b/drivers/platform/x86/Kconfig +@@ -573,9 +573,28 @@ config THINKPAD_ACPI_HOTKEY_POLL + If you are not sure, say Y here. The driver enables polling only if + it is strictly necessary to do so. + ++config THINKPAD_EC ++ tristate ++ ---help--- ++ This is a low-level driver for accessing the ThinkPad H8S embedded ++ controller over the LPC bus (not to be confused with the ACPI Embedded ++ Controller interface). ++ ++config TP_SMAPI ++ tristate "ThinkPad SMAPI Support" ++ select THINKPAD_EC ++ default n ++ help ++ This adds SMAPI support on Lenovo/IBM ThinkPads, for features such ++ as battery charging control. For more information about this driver ++ see . ++ ++ If you have a Lenovo/IBM ThinkPad laptop, say Y or M here. ++ + config SENSORS_HDAPS + tristate "Thinkpad Hard Drive Active Protection System (hdaps)" + depends on INPUT ++ select THINKPAD_EC + select INPUT_POLLDEV + help + This driver provides support for the IBM Hard Drive Active Protection +diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile +index 2ba6cb795338..399f8b88646f 100644 +--- a/drivers/platform/x86/Makefile ++++ b/drivers/platform/x86/Makefile +@@ -35,6 +35,8 @@ obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o + obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o + obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o + obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o ++obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o ++obj-$(CONFIG_TP_SMAPI) += tp_smapi.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o + obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o + obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o +diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c +index c26baf77938e..1814614f240c 100644 +--- a/drivers/platform/x86/hdaps.c ++++ b/drivers/platform/x86/hdaps.c +@@ -2,7 +2,7 @@ + * hdaps.c - driver for IBM's Hard Drive Active Protection System + * + * Copyright (C) 2005 Robert Love +- * Copyright (C) 2005 Jesper Juhl ++ * Copyright (C) 2005 Jesper Juhl + * + * The HardDisk Active Protection System (hdaps) is present in IBM ThinkPads + * starting with the R40, T41, and X40. 
It provides a basic two-axis +@@ -30,266 +30,384 @@ + + #include + #include +-#include ++#include + #include +-#include + #include + #include + #include + #include +-#include +- +-#define HDAPS_LOW_PORT 0x1600 /* first port used by hdaps */ +-#define HDAPS_NR_PORTS 0x30 /* number of ports: 0x1600 - 0x162f */ +- +-#define HDAPS_PORT_STATE 0x1611 /* device state */ +-#define HDAPS_PORT_YPOS 0x1612 /* y-axis position */ +-#define HDAPS_PORT_XPOS 0x1614 /* x-axis position */ +-#define HDAPS_PORT_TEMP1 0x1616 /* device temperature, in Celsius */ +-#define HDAPS_PORT_YVAR 0x1617 /* y-axis variance (what is this?) */ +-#define HDAPS_PORT_XVAR 0x1619 /* x-axis variance (what is this?) */ +-#define HDAPS_PORT_TEMP2 0x161b /* device temperature (again?) */ +-#define HDAPS_PORT_UNKNOWN 0x161c /* what is this? */ +-#define HDAPS_PORT_KMACT 0x161d /* keyboard or mouse activity */ +- +-#define STATE_FRESH 0x50 /* accelerometer data is fresh */ ++#include ++#include ++#include ++ ++/* Embedded controller accelerometer read command and its result: */ ++static const struct thinkpad_ec_row ec_accel_args = ++ { .mask = 0x0001, .val = {0x11} }; ++#define EC_ACCEL_IDX_READOUTS 0x1 /* readouts included in this read */ ++ /* First readout, if READOUTS>=1: */ ++#define EC_ACCEL_IDX_YPOS1 0x2 /* y-axis position word */ ++#define EC_ACCEL_IDX_XPOS1 0x4 /* x-axis position word */ ++#define EC_ACCEL_IDX_TEMP1 0x6 /* device temperature in Celsius */ ++ /* Second readout, if READOUTS>=2: */ ++#define EC_ACCEL_IDX_XPOS2 0x7 /* y-axis position word */ ++#define EC_ACCEL_IDX_YPOS2 0x9 /* x-axis position word */ ++#define EC_ACCEL_IDX_TEMP2 0xb /* device temperature in Celsius */ ++#define EC_ACCEL_IDX_QUEUED 0xc /* Number of queued readouts left */ ++#define EC_ACCEL_IDX_KMACT 0xd /* keyboard or mouse activity */ ++#define EC_ACCEL_IDX_RETVAL 0xf /* command return value, good=0x00 */ + + #define KEYBD_MASK 0x20 /* set if keyboard activity */ + #define MOUSE_MASK 0x40 /* set if mouse activity */ +-#define KEYBD_ISSET(n) (!! (n & KEYBD_MASK)) /* keyboard used? */ +-#define MOUSE_ISSET(n) (!! (n & MOUSE_MASK)) /* mouse used? */ + +-#define INIT_TIMEOUT_MSECS 4000 /* wait up to 4s for device init ... */ +-#define INIT_WAIT_MSECS 200 /* ... in 200ms increments */ ++#define READ_TIMEOUT_MSECS 100 /* wait this long for device read */ ++#define RETRY_MSECS 3 /* retry delay */ + +-#define HDAPS_POLL_INTERVAL 50 /* poll for input every 1/20s (50 ms)*/ + #define HDAPS_INPUT_FUZZ 4 /* input event threshold */ + #define HDAPS_INPUT_FLAT 4 +- +-#define HDAPS_X_AXIS (1 << 0) +-#define HDAPS_Y_AXIS (1 << 1) +-#define HDAPS_BOTH_AXES (HDAPS_X_AXIS | HDAPS_Y_AXIS) +- ++#define KMACT_REMEMBER_PERIOD (HZ/10) /* keyboard/mouse persistence */ ++ ++/* Input IDs */ ++#define HDAPS_INPUT_VENDOR PCI_VENDOR_ID_IBM ++#define HDAPS_INPUT_PRODUCT 0x5054 /* "TP", shared with thinkpad_acpi */ ++#define HDAPS_INPUT_JS_VERSION 0x6801 /* Joystick emulation input device */ ++#define HDAPS_INPUT_RAW_VERSION 0x4801 /* Raw accelerometer input device */ ++ ++/* Axis orientation. */ ++/* The unnatural bit-representation of inversions is for backward ++ * compatibility with the"invert=1" module parameter. */ ++#define HDAPS_ORIENT_INVERT_XY 0x01 /* Invert both X and Y axes. */ ++#define HDAPS_ORIENT_INVERT_X 0x02 /* Invert the X axis (uninvert if ++ * already inverted by INVERT_XY). */ ++#define HDAPS_ORIENT_SWAP 0x04 /* Swap the axes. The swap occurs ++ * before inverting X or Y. 
*/ ++#define HDAPS_ORIENT_MAX 0x07 ++#define HDAPS_ORIENT_UNDEFINED 0xFF /* Placeholder during initialization */ ++#define HDAPS_ORIENT_INVERT_Y (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X) ++ ++static struct timer_list hdaps_timer; + static struct platform_device *pdev; +-static struct input_polled_dev *hdaps_idev; +-static unsigned int hdaps_invert; +-static u8 km_activity; +-static int rest_x; +-static int rest_y; +- +-static DEFINE_MUTEX(hdaps_mtx); +- +-/* +- * __get_latch - Get the value from a given port. Callers must hold hdaps_mtx. +- */ +-static inline u8 __get_latch(u16 port) ++static struct input_dev *hdaps_idev; /* joystick-like device with fuzz */ ++static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */ ++static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED; ++static int needs_calibration; ++ ++/* Configuration: */ ++static int sampling_rate = 50; /* Sampling rate */ ++static int oversampling_ratio = 5; /* Ratio between our sampling rate and ++ * EC accelerometer sampling rate */ ++static int running_avg_filter_order = 2; /* EC running average filter order */ ++ ++/* Latest state readout: */ ++static int pos_x, pos_y; /* position */ ++static int temperature; /* temperature */ ++static int stale_readout = 1; /* last read invalid */ ++static int rest_x, rest_y; /* calibrated rest position */ ++ ++/* Last time we saw keyboard and mouse activity: */ ++static u64 last_keyboard_jiffies = INITIAL_JIFFIES; ++static u64 last_mouse_jiffies = INITIAL_JIFFIES; ++static u64 last_update_jiffies = INITIAL_JIFFIES; ++ ++/* input device use count */ ++static int hdaps_users; ++static DEFINE_MUTEX(hdaps_users_mtx); ++ ++/* Some models require an axis transformation to the standard representation */ ++static void transform_axes(int *x, int *y) + { +- return inb(port) & 0xff; ++ if (hdaps_invert & HDAPS_ORIENT_SWAP) { ++ int z; ++ z = *x; ++ *x = *y; ++ *y = z; ++ } ++ if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) { ++ *x = -*x; ++ *y = -*y; ++ } ++ if (hdaps_invert & HDAPS_ORIENT_INVERT_X) ++ *x = -*x; + } + +-/* +- * __check_latch - Check a port latch for a given value. Returns zero if the +- * port contains the given value. Callers must hold hdaps_mtx. ++/** ++ * __hdaps_update - query current state, with locks already acquired ++ * @fast: if nonzero, do one quick attempt without retries. ++ * ++ * Query current accelerometer state and update global state variables. ++ * Also prefetches the next query. Caller must hold controller lock. + */ +-static inline int __check_latch(u16 port, u8 val) ++static int __hdaps_update(int fast) + { +- if (__get_latch(port) == val) +- return 0; +- return -EINVAL; +-} ++ /* Read data: */ ++ struct thinkpad_ec_row data; ++ int ret; + +-/* +- * __wait_latch - Wait up to 100us for a port latch to get a certain value, +- * returning zero if the value is obtained. Callers must hold hdaps_mtx. 
+- */ +-static int __wait_latch(u16 port, u8 val) +-{ +- unsigned int i; ++ data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) | ++ (3 << EC_ACCEL_IDX_YPOS1) | (3 << EC_ACCEL_IDX_XPOS1) | ++ (1 << EC_ACCEL_IDX_TEMP1) | (1 << EC_ACCEL_IDX_RETVAL); ++ if (fast) ++ ret = thinkpad_ec_try_read_row(&ec_accel_args, &data); ++ else ++ ret = thinkpad_ec_read_row(&ec_accel_args, &data); ++ thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */ ++ if (ret) ++ return ret; + +- for (i = 0; i < 20; i++) { +- if (!__check_latch(port, val)) +- return 0; +- udelay(5); ++ /* Check status: */ ++ if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) { ++ pr_warn("read RETVAL=0x%02x\n", ++ data.val[EC_ACCEL_IDX_RETVAL]); ++ return -EIO; + } + +- return -EIO; ++ if (data.val[EC_ACCEL_IDX_READOUTS] < 1) ++ return -EBUSY; /* no pending readout, try again later */ ++ ++ /* Parse position data: */ ++ pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1); ++ pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1); ++ transform_axes(&pos_x, &pos_y); ++ ++ /* Keyboard and mouse activity status is cleared as soon as it's read, ++ * so applications will eat each other's events. Thus we remember any ++ * event for KMACT_REMEMBER_PERIOD jiffies. ++ */ ++ if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK) ++ last_keyboard_jiffies = get_jiffies_64(); ++ if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK) ++ last_mouse_jiffies = get_jiffies_64(); ++ ++ temperature = data.val[EC_ACCEL_IDX_TEMP1]; ++ ++ last_update_jiffies = get_jiffies_64(); ++ stale_readout = 0; ++ if (needs_calibration) { ++ rest_x = pos_x; ++ rest_y = pos_y; ++ needs_calibration = 0; ++ } ++ ++ return 0; + } + +-/* +- * __device_refresh - request a refresh from the accelerometer. Does not wait +- * for refresh to complete. Callers must hold hdaps_mtx. ++/** ++ * hdaps_update - acquire locks and query current state ++ * ++ * Query current accelerometer state and update global state variables. ++ * Also prefetches the next query. ++ * Retries until timeout if the accelerometer is not in ready status (common). ++ * Does its own locking. + */ +-static void __device_refresh(void) ++static int hdaps_update(void) + { +- udelay(200); +- if (inb(0x1604) != STATE_FRESH) { +- outb(0x11, 0x1610); +- outb(0x01, 0x161f); ++ u64 age = get_jiffies_64() - last_update_jiffies; ++ int total, ret; ++ ++ if (!stale_readout && age < (9*HZ)/(10*sampling_rate)) ++ return 0; /* already updated recently */ ++ for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) { ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = __hdaps_update(0); ++ thinkpad_ec_unlock(); ++ ++ if (!ret) ++ return 0; ++ if (ret != -EBUSY) ++ break; ++ msleep(RETRY_MSECS); + } ++ return ret; + } + +-/* +- * __device_refresh_sync - request a synchronous refresh from the +- * accelerometer. We wait for the refresh to complete. Returns zero if +- * successful and nonzero on error. Callers must hold hdaps_mtx. ++/** ++ * hdaps_set_power - enable or disable power to the accelerometer. ++ * Returns zero on success and negative error code on failure. Can sleep. 
+ */ +-static int __device_refresh_sync(void) ++static int hdaps_set_power(int on) + { +- __device_refresh(); +- return __wait_latch(0x1604, STATE_FRESH); ++ struct thinkpad_ec_row args = ++ { .mask = 0x0003, .val = {0x14, on?0x01:0x00} }; ++ struct thinkpad_ec_row data = { .mask = 0x8000 }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ if (ret) ++ return ret; ++ if (data.val[0xF] != 0x00) ++ return -EIO; ++ return 0; + } + +-/* +- * __device_complete - indicate to the accelerometer that we are done reading +- * data, and then initiate an async refresh. Callers must hold hdaps_mtx. ++/** ++ * hdaps_set_ec_config - set accelerometer parameters. ++ * @ec_rate: embedded controller sampling rate ++ * @order: embedded controller running average filter order ++ * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.) ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static inline void __device_complete(void) ++static int hdaps_set_ec_config(int ec_rate, int order) + { +- inb(0x161f); +- inb(0x1604); +- __device_refresh(); ++ struct thinkpad_ec_row args = { .mask = 0x000F, ++ .val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} }; ++ struct thinkpad_ec_row data = { .mask = 0x8000 }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order); ++ if (ret) ++ return ret; ++ if (data.val[0xF] == 0x03) { ++ pr_warn("config param out of range\n"); ++ return -EINVAL; ++ } ++ if (data.val[0xF] == 0x06) { ++ pr_warn("config change already pending\n"); ++ return -EBUSY; ++ } ++ if (data.val[0xF] != 0x00) { ++ pr_warn("config change error, ret=%d\n", ++ data.val[0xF]); ++ return -EIO; ++ } ++ return 0; + } + +-/* +- * hdaps_readb_one - reads a byte from a single I/O port, placing the value in +- * the given pointer. Returns zero on success or a negative error on failure. +- * Can sleep. ++/** ++ * hdaps_get_ec_config - get accelerometer parameters. ++ * @ec_rate: embedded controller sampling rate ++ * @order: embedded controller running average filter order ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static int hdaps_readb_one(unsigned int port, u8 *val) ++static int hdaps_get_ec_config(int *ec_rate, int *order) + { +- int ret; +- +- mutex_lock(&hdaps_mtx); +- +- /* do a sync refresh -- we need to be sure that we read fresh data */ +- ret = __device_refresh_sync(); ++ const struct thinkpad_ec_row args = ++ { .mask = 0x0003, .val = {0x17, 0x82} }; ++ struct thinkpad_ec_row data = { .mask = 0x801F }; ++ int ret = thinkpad_ec_read_row(&args, &data); + if (ret) +- goto out; +- +- *val = inb(port); +- __device_complete(); +- +-out: +- mutex_unlock(&hdaps_mtx); +- return ret; ++ return ret; ++ if (data.val[0xF] != 0x00) ++ return -EIO; ++ if (!(data.val[0x1] & 0x01)) ++ return -ENXIO; /* accelerometer polling not enabled */ ++ if (data.val[0x1] & 0x02) ++ return -EBUSY; /* config change in progress, retry later */ ++ *ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8); ++ *order = data.val[0x4]; ++ return 0; + } + +-/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). */ +-static int __hdaps_read_pair(unsigned int port1, unsigned int port2, +- int *x, int *y) ++/** ++ * hdaps_get_ec_mode - get EC accelerometer mode ++ * Returns zero on success and negative error code on failure. Can sleep. 
++ */ ++static int hdaps_get_ec_mode(u8 *mode) + { +- /* do a sync refresh -- we need to be sure that we read fresh data */ +- if (__device_refresh_sync()) ++ const struct thinkpad_ec_row args = ++ { .mask = 0x0001, .val = {0x13} }; ++ struct thinkpad_ec_row data = { .mask = 0x8002 }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ if (ret) ++ return ret; ++ if (data.val[0xF] != 0x00) { ++ pr_warn("accelerometer not implemented (0x%02x)\n", ++ data.val[0xF]); + return -EIO; +- +- *y = inw(port2); +- *x = inw(port1); +- km_activity = inb(HDAPS_PORT_KMACT); +- __device_complete(); +- +- /* hdaps_invert is a bitvector to negate the axes */ +- if (hdaps_invert & HDAPS_X_AXIS) +- *x = -*x; +- if (hdaps_invert & HDAPS_Y_AXIS) +- *y = -*y; +- ++ } ++ *mode = data.val[0x1]; + return 0; + } + +-/* +- * hdaps_read_pair - reads the values from a pair of ports, placing the values +- * in the given pointers. Returns zero on success. Can sleep. ++/** ++ * hdaps_check_ec - checks something about the EC. ++ * Follows the clean-room spec for HDAPS; we don't know what it means. ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static int hdaps_read_pair(unsigned int port1, unsigned int port2, +- int *val1, int *val2) ++static int hdaps_check_ec(void) + { +- int ret; +- +- mutex_lock(&hdaps_mtx); +- ret = __hdaps_read_pair(port1, port2, val1, val2); +- mutex_unlock(&hdaps_mtx); +- +- return ret; ++ const struct thinkpad_ec_row args = ++ { .mask = 0x0003, .val = {0x17, 0x81} }; ++ struct thinkpad_ec_row data = { .mask = 0x800E }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ if (ret) ++ return ret; ++ if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */ ++ (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */ ++ data.val[0x3] != 0x00 || data.val[0xF] != 0x00) { ++ pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n", ++ data.val[0x1], data.val[0x2], ++ data.val[0x3], data.val[0xF]); ++ return -EIO; ++ } ++ return 0; + } + +-/* +- * hdaps_device_init - initialize the accelerometer. Returns zero on success +- * and negative error code on failure. Can sleep. ++/** ++ * hdaps_device_init - initialize the accelerometer. ++ * ++ * Call several embedded controller functions to test and initialize the ++ * accelerometer. ++ * Returns zero on success and negative error code on failure. Can sleep. + */ ++#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg) + static int hdaps_device_init(void) + { +- int total, ret = -ENXIO; ++ int ret; ++ u8 mode; + +- mutex_lock(&hdaps_mtx); ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; + +- outb(0x13, 0x1610); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; ++ if (hdaps_get_ec_mode(&mode)) ++ { FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; } + +- /* +- * Most ThinkPads return 0x01. +- * +- * Others--namely the R50p, T41p, and T42p--return 0x03. These laptops +- * have "inverted" axises. +- * +- * The 0x02 value occurs when the chip has been previously initialized. 
+- */ +- if (__check_latch(0x1611, 0x03) && +- __check_latch(0x1611, 0x02) && +- __check_latch(0x1611, 0x01)) +- goto out; ++ pr_debug("initial mode latch is 0x%02x\n", mode); ++ if (mode == 0x00) ++ { FAILED_INIT("accelerometer not available"); goto bad; } + +- printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n", +- __get_latch(0x1611)); ++ if (hdaps_check_ec()) ++ { FAILED_INIT("hdaps_check_ec failed"); goto bad; } + +- outb(0x17, 0x1610); +- outb(0x81, 0x1611); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; +- if (__wait_latch(0x1611, 0x00)) +- goto out; +- if (__wait_latch(0x1612, 0x60)) +- goto out; +- if (__wait_latch(0x1613, 0x00)) +- goto out; +- outb(0x14, 0x1610); +- outb(0x01, 0x1611); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; +- outb(0x10, 0x1610); +- outb(0xc8, 0x1611); +- outb(0x00, 0x1612); +- outb(0x02, 0x1613); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; +- if (__device_refresh_sync()) +- goto out; +- if (__wait_latch(0x1611, 0x00)) +- goto out; +- +- /* we have done our dance, now let's wait for the applause */ +- for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) { +- int x, y; ++ if (hdaps_set_power(1)) ++ { FAILED_INIT("hdaps_set_power failed"); goto bad; } + +- /* a read of the device helps push it into action */ +- __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); +- if (!__wait_latch(0x1611, 0x02)) { +- ret = 0; +- break; +- } ++ if (hdaps_set_ec_config(sampling_rate*oversampling_ratio, ++ running_avg_filter_order)) ++ { FAILED_INIT("hdaps_set_ec_config failed"); goto bad; } + +- msleep(INIT_WAIT_MSECS); +- } ++ thinkpad_ec_invalidate(); ++ udelay(200); + +-out: +- mutex_unlock(&hdaps_mtx); ++ /* Just prefetch instead of reading, to avoid ~1sec delay on load */ ++ ret = thinkpad_ec_prefetch_row(&ec_accel_args); ++ if (ret) ++ { FAILED_INIT("initial prefetch failed"); goto bad; } ++ goto good; ++bad: ++ thinkpad_ec_invalidate(); ++ ret = -ENXIO; ++good: ++ stale_readout = 1; ++ thinkpad_ec_unlock(); + return ret; + } + ++/** ++ * hdaps_device_shutdown - power off the accelerometer ++ * Returns nonzero on failure. Can sleep. ++ */ ++static int hdaps_device_shutdown(void) ++{ ++ int ret; ++ ret = hdaps_set_power(0); ++ if (ret) { ++ pr_warn("cannot power off\n"); ++ return ret; ++ } ++ ret = hdaps_set_ec_config(0, 1); ++ if (ret) ++ pr_warn("cannot stop EC sampling\n"); ++ return ret; ++} + + /* Device model stuff */ + +@@ -306,13 +424,29 @@ static int hdaps_probe(struct platform_device *dev) + } + + #ifdef CONFIG_PM_SLEEP ++static int hdaps_suspend(struct device *dev) ++{ ++ /* Don't do hdaps polls until resume re-initializes the sensor. */ ++ del_timer_sync(&hdaps_timer); ++ hdaps_device_shutdown(); /* ignore errors, effect is negligible */ ++ return 0; ++} ++ + static int hdaps_resume(struct device *dev) + { +- return hdaps_device_init(); ++ int ret = hdaps_device_init(); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&hdaps_users_mtx); ++ if (hdaps_users) ++ mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); ++ mutex_unlock(&hdaps_users_mtx); ++ return 0; + } + #endif + +-static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume); ++static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume); + + static struct platform_driver hdaps_driver = { + .probe = hdaps_probe, +@@ -322,30 +456,51 @@ static struct platform_driver hdaps_driver = { + }, + }; + +-/* +- * hdaps_calibrate - Set our "resting" values. Callers must hold hdaps_mtx. 
++/** ++ * hdaps_calibrate - set our "resting" values. ++ * Does its own locking. + */ + static void hdaps_calibrate(void) + { +- __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y); ++ needs_calibration = 1; ++ hdaps_update(); ++ /* If that fails, the mousedev poll will take care of things later. */ + } + +-static void hdaps_mousedev_poll(struct input_polled_dev *dev) ++/* Timer handler for updating the input device. Runs in softirq context, ++ * so avoid lenghty or blocking operations. ++ */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) ++static void hdaps_mousedev_poll(unsigned long unused) ++#else ++static void hdaps_mousedev_poll(struct timer_list *unused) ++#endif + { +- struct input_dev *input_dev = dev->input; +- int x, y; ++ int ret; + +- mutex_lock(&hdaps_mtx); ++ stale_readout = 1; + +- if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y)) +- goto out; ++ /* Cannot sleep. Try nonblockingly. If we fail, try again later. */ ++ if (thinkpad_ec_try_lock()) ++ goto keep_active; + +- input_report_abs(input_dev, ABS_X, x - rest_x); +- input_report_abs(input_dev, ABS_Y, y - rest_y); +- input_sync(input_dev); ++ ret = __hdaps_update(1); /* fast update, we're in softirq context */ ++ thinkpad_ec_unlock(); ++ /* Any of "successful", "not yet ready" and "not prefetched"? */ ++ if (ret != 0 && ret != -EBUSY && ret != -ENODATA) { ++ pr_err("poll failed, disabling updates\n"); ++ return; ++ } + +-out: +- mutex_unlock(&hdaps_mtx); ++keep_active: ++ /* Even if we failed now, pos_x,y may have been updated earlier: */ ++ input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x); ++ input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y); ++ input_sync(hdaps_idev); ++ input_report_abs(hdaps_idev_raw, ABS_X, pos_x); ++ input_report_abs(hdaps_idev_raw, ABS_Y, pos_y); ++ input_sync(hdaps_idev_raw); ++ mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); + } + + +@@ -354,65 +509,41 @@ static void hdaps_mousedev_poll(struct input_polled_dev *dev) + static ssize_t hdaps_position_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +- int ret, x, y; +- +- ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); ++ int ret = hdaps_update(); + if (ret) + return ret; +- +- return sprintf(buf, "(%d,%d)\n", x, y); +-} +- +-static ssize_t hdaps_variance_show(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- int ret, x, y; +- +- ret = hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y); +- if (ret) +- return ret; +- +- return sprintf(buf, "(%d,%d)\n", x, y); ++ return sprintf(buf, "(%d,%d)\n", pos_x, pos_y); + } + + static ssize_t hdaps_temp1_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +- u8 uninitialized_var(temp); +- int ret; +- +- ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp); +- if (ret) +- return ret; +- +- return sprintf(buf, "%u\n", temp); +-} +- +-static ssize_t hdaps_temp2_show(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- u8 uninitialized_var(temp); +- int ret; +- +- ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp); ++ int ret = hdaps_update(); + if (ret) + return ret; +- +- return sprintf(buf, "%u\n", temp); ++ return sprintf(buf, "%d\n", temperature); + } + + static ssize_t hdaps_keyboard_activity_show(struct device *dev, + struct device_attribute *attr, + char *buf) + { +- return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity)); ++ int ret = hdaps_update(); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%u\n", ++ get_jiffies_64() < last_keyboard_jiffies + 
KMACT_REMEMBER_PERIOD); + } + + static ssize_t hdaps_mouse_activity_show(struct device *dev, + struct device_attribute *attr, + char *buf) + { +- return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity)); ++ int ret = hdaps_update(); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%u\n", ++ get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD); + } + + static ssize_t hdaps_calibrate_show(struct device *dev, +@@ -425,10 +556,7 @@ static ssize_t hdaps_calibrate_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + { +- mutex_lock(&hdaps_mtx); + hdaps_calibrate(); +- mutex_unlock(&hdaps_mtx); +- + return count; + } + +@@ -445,7 +573,7 @@ static ssize_t hdaps_invert_store(struct device *dev, + int invert; + + if (sscanf(buf, "%d", &invert) != 1 || +- invert < 0 || invert > HDAPS_BOTH_AXES) ++ invert < 0 || invert > HDAPS_ORIENT_MAX) + return -EINVAL; + + hdaps_invert = invert; +@@ -454,24 +582,128 @@ static ssize_t hdaps_invert_store(struct device *dev, + return count; + } + ++static ssize_t hdaps_sampling_rate_show( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%d\n", sampling_rate); ++} ++ ++static ssize_t hdaps_sampling_rate_store( ++ struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int rate, ret; ++ if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) { ++ pr_warn("must have 0ident); +- return 1; +-} +- + /* hdaps_dmi_match_invert - found an inverted match. */ + static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id) + { +- hdaps_invert = (unsigned long)id->driver_data; +- pr_info("inverting axis (%u) readings\n", hdaps_invert); +- return hdaps_dmi_match(id); ++ unsigned int orient = (kernel_ulong_t) id->driver_data; ++ hdaps_invert = orient; ++ pr_info("%s detected, setting orientation %u\n", id->ident, orient); ++ return 1; /* stop enumeration */ + } + +-#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) { \ ++#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \ + .ident = vendor " " model, \ + .callback = hdaps_dmi_match_invert, \ +- .driver_data = (void *)axes, \ ++ .driver_data = (void *)(orient), \ + .matches = { \ + DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_VERSION, model) \ + } \ + } + +-#define HDAPS_DMI_MATCH_NORMAL(vendor, model) \ +- HDAPS_DMI_MATCH_INVERT(vendor, model, 0) +- +-/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match +- "ThinkPad T42p", so the order of the entries matters. +- If your ThinkPad is not recognized, please update to latest +- BIOS. This is especially the case for some R52 ThinkPads. 
*/ +-static const struct dmi_system_id hdaps_whitelist[] __initconst = { +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"), +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"), +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES), ++/* List of models with abnormal axis configuration. ++ Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match ++ "ThinkPad T42p", and enumeration stops after first match, ++ so the order of the entries matters. */ ++const struct dmi_system_id hdaps_whitelist[] __initconst = { ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | 
HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W510", HDAPS_ORIENT_MAX), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W520", HDAPS_ORIENT_MAX), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP), + { .ident = NULL } + }; + + static int __init hdaps_init(void) + { +- struct input_dev *idev; + int ret; + +- if (!dmi_check_system(hdaps_whitelist)) { +- pr_warn("supported laptop not found!\n"); +- ret = -ENODEV; +- goto out; +- } +- +- if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) { +- ret = -ENXIO; +- goto out; +- } +- ++ /* Determine axis orientation orientation */ ++ if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */ ++ if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist? */ ++ hdaps_invert = 0; /* default */ ++ ++ /* Init timer before platform_driver_register, in case of suspend */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) ++ init_timer(&hdaps_timer); ++ hdaps_timer.function = hdaps_mousedev_poll; ++#else ++ timer_setup(&hdaps_timer, hdaps_mousedev_poll, 0); ++#endif + ret = platform_driver_register(&hdaps_driver); + if (ret) +- goto out_region; ++ goto out; + + pdev = platform_device_register_simple("hdaps", -1, NULL, 0); + if (IS_ERR(pdev)) { +@@ -571,47 +801,79 @@ static int __init hdaps_init(void) + if (ret) + goto out_device; + +- hdaps_idev = input_allocate_polled_device(); ++ hdaps_idev = input_allocate_device(); + if (!hdaps_idev) { + ret = -ENOMEM; + goto out_group; + } + +- hdaps_idev->poll = hdaps_mousedev_poll; +- hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL; +- +- /* initial calibrate for the input device */ +- hdaps_calibrate(); ++ hdaps_idev_raw = input_allocate_device(); ++ if (!hdaps_idev_raw) { ++ ret = -ENOMEM; ++ goto out_idev_first; ++ } + +- /* initialize the input class */ +- idev = hdaps_idev->input; +- idev->name = "hdaps"; +- idev->phys = "isa1600/input0"; +- idev->id.bustype = BUS_ISA; +- idev->dev.parent = &pdev->dev; +- idev->evbit[0] = BIT_MASK(EV_ABS); +- input_set_abs_params(idev, ABS_X, ++ /* calibration for the input device (deferred to avoid delay) */ ++ needs_calibration = 1; ++ ++ /* initialize the joystick-like fuzzed input device */ ++ hdaps_idev->name = "ThinkPad HDAPS joystick emulation"; ++ hdaps_idev->phys = "hdaps/input0"; ++ hdaps_idev->id.bustype = BUS_HOST; ++ hdaps_idev->id.vendor = HDAPS_INPUT_VENDOR; ++ hdaps_idev->id.product = HDAPS_INPUT_PRODUCT; ++ hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) ++ hdaps_idev->cdev.dev = &pdev->dev; ++#endif ++ hdaps_idev->evbit[0] = BIT(EV_ABS); ++ hdaps_idev->open = hdaps_mousedev_open; ++ hdaps_idev->close = hdaps_mousedev_close; ++ input_set_abs_params(hdaps_idev, ABS_X, + -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); +- input_set_abs_params(idev, ABS_Y, ++ input_set_abs_params(hdaps_idev, ABS_Y, + -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); + +- ret = input_register_polled_device(hdaps_idev); ++ ret = input_register_device(hdaps_idev); 
+ if (ret) + goto out_idev; + +- pr_info("driver successfully loaded\n"); ++ /* initialize the raw data input device */ ++ hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data"; ++ hdaps_idev_raw->phys = "hdaps/input1"; ++ hdaps_idev_raw->id.bustype = BUS_HOST; ++ hdaps_idev_raw->id.vendor = HDAPS_INPUT_VENDOR; ++ hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT; ++ hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) ++ hdaps_idev_raw->cdev.dev = &pdev->dev; ++#endif ++ hdaps_idev_raw->evbit[0] = BIT(EV_ABS); ++ hdaps_idev_raw->open = hdaps_mousedev_open; ++ hdaps_idev_raw->close = hdaps_mousedev_close; ++ input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0); ++ input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0); ++ ++ ret = input_register_device(hdaps_idev_raw); ++ if (ret) ++ goto out_idev_reg_first; ++ ++ pr_info("driver successfully loaded.\n"); + return 0; + ++out_idev_reg_first: ++ input_unregister_device(hdaps_idev); + out_idev: +- input_free_polled_device(hdaps_idev); ++ input_free_device(hdaps_idev_raw); ++out_idev_first: ++ input_free_device(hdaps_idev); + out_group: + sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); + out_device: + platform_device_unregister(pdev); + out_driver: + platform_driver_unregister(&hdaps_driver); +-out_region: +- release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); ++ hdaps_device_shutdown(); + out: + pr_warn("driver init failed (ret=%d)!\n", ret); + return ret; +@@ -619,12 +881,12 @@ static int __init hdaps_init(void) + + static void __exit hdaps_exit(void) + { +- input_unregister_polled_device(hdaps_idev); +- input_free_polled_device(hdaps_idev); ++ input_unregister_device(hdaps_idev_raw); ++ input_unregister_device(hdaps_idev); ++ hdaps_device_shutdown(); /* ignore errors, effect is negligible */ + sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); + platform_device_unregister(pdev); + platform_driver_unregister(&hdaps_driver); +- release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); + + pr_info("driver unloaded\n"); + } +@@ -632,9 +894,8 @@ static void __exit hdaps_exit(void) + module_init(hdaps_init); + module_exit(hdaps_exit); + +-module_param_named(invert, hdaps_invert, int, 0); +-MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, " +- "2 invert y-axis, 3 invert both axes."); ++module_param_named(invert, hdaps_invert, uint, 0); ++MODULE_PARM_DESC(invert, "axis orientation code"); + + MODULE_AUTHOR("Robert Love"); + MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver"); +diff --git a/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c +new file mode 100644 +index 000000000000..597614bc17e6 +--- /dev/null ++++ b/drivers/platform/x86/thinkpad_ec.c +@@ -0,0 +1,513 @@ ++/* ++ * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions ++ * ++ * The embedded controller on ThinkPad laptops has a non-standard interface, ++ * where LPC channel 3 of the H8S EC chip is hooked up to IO ports ++ * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. ++ * The EC LPC interface provides various system management services (currently ++ * known: battery information and accelerometer readouts). This driver ++ * provides access and mutual exclusion for the EC interface. 
++* ++ * The LPC protocol and terminology are documented here: ++ * "H8S/2104B Group Hardware Manual", ++ * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf ++ * ++ * Copyright (C) 2006-2007 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++ #include ++#else ++ #include ++#endif ++ ++#define TP_VERSION "0.42" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++/* IO ports used by embedded controller LPC channel 3: */ ++#define TPC_BASE_PORT 0x1600 ++#define TPC_NUM_PORTS 0x20 ++#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ ++#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ ++#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. */ ++ /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 00x%02x", \ ++ msg, args->val[0x0], args->val[0xF], code) ++ ++/* State of request prefetching: */ ++static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ ++static u64 prefetch_jiffies; /* time of prefetch, or: */ ++#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ ++#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ ++ ++/* Locking: */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(thinkpad_ec_mutex); ++#else ++static DEFINE_SEMAPHORE(thinkpad_ec_mutex); ++#endif ++ ++/* Kludge in case the ACPI DSDT reserves the ports we need. */ ++static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ ++static int reserved_io; /* Successfully reserved the ports? */ ++module_param_named(force_io, force_io, bool, 0600); ++MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); ++ ++/** ++ * thinkpad_ec_lock - get lock on the ThinkPad EC ++ * ++ * Get exclusive lock for accesing the ThinkPad embedded controller LPC3 ++ * interface. Returns 0 iff lock acquired. ++ */ ++int thinkpad_ec_lock(void) ++{ ++ int ret; ++ ret = down_interruptible(&thinkpad_ec_mutex); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_lock); ++ ++/** ++ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC ++ * ++ * Try getting an exclusive lock for accesing the ThinkPad embedded ++ * controller LPC3. Returns immediately if lock is not available; neither ++ * blocks nor sleeps. Returns 0 iff lock acquired . 
++ */ ++int thinkpad_ec_try_lock(void) ++{ ++ return down_trylock(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); ++ ++/** ++ * thinkpad_ec_unlock - release lock on ThinkPad EC ++ * ++ * Release a previously acquired exclusive lock on the ThinkPad ebmedded ++ * controller LPC3 interface. ++ */ ++void thinkpad_ec_unlock(void) ++{ ++ up(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); ++ ++/** ++ * thinkpad_ec_request_row - tell embedded controller to prepare a row ++ * @args Input register arguments ++ * ++ * Requests a data row by writing to H8S LPC registers TRW0 through TWR15 (or ++ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group ++ * Hardware Manual". Does sanity checks via status register STR3. ++ */ ++static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) ++{ ++ u8 str3; ++ int i; ++ ++ /* EC protocol requires write to TWR0 (function code): */ ++ if (!(args->mask & 0x0001)) { ++ printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); ++ return -EINVAL; ++ } ++ ++ /* Check initial STR3 status: */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) { /* data already pending */ ++ inb(TPC_TWR15_PORT); /* marks end of previous transaction */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC has result from unrequested transaction", ++ str3)); ++ return -EBUSY; /* EC will be ready in a few usecs */ ++ } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC is busy with unrequested transaction", ++ str3)); ++ return -EBUSY; /* data will be pending in a few usecs */ ++ } else if (str3 != 0x00) { /* unexpected status? */ ++ printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR0MW: */ ++ outb(args->val[0], TPC_TWR0_PORT); ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 != H8S_STR3_MWMF) { /* not accepted? */ ++ printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR1 through TWR14: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((args->mask>>i)&1) ++ outb(args->val[i], TPC_TWR0_PORT+i); ++ ++ /* Send TWR15 (default to 0x01). This marks end of command. */ ++ outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); ++ ++ /* Wait until EC starts writing its reply (~60ns on average). ++ * Releasing locks before this happens may cause an EC hang ++ * due to firmware bug! ++ */ ++ for (i = 0; i < TPC_REQUEST_RETRIES; i++) { ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_SWMF) /* EC started replying */ ++ return 0; ++ else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) ++ /* Normal progress (the EC hasn't seen the request ++ * yet, or is processing it). Wait it out. */ ++ ndelay(TPC_REQUEST_NDELAY); ++ else { /* weird EC status */ ++ printk(KERN_WARNING ++ REQ_FMT("bad end STR3", str3)); ++ return -EIO; ++ } ++ } ++ printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); ++ return -EIO; ++} ++ ++/** ++ * thinkpad_ec_read_data - read pre-requested row-data from EC ++ * @args Input register arguments of pre-requested rows ++ * @data Output register values ++ * ++ * Reads current row data from the controller, assuming it's already ++ * requested. Follows the H8S spec for register access and status checks. 
++ */ ++static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int i; ++ u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ /* Once we make a request, STR3 assumes the sequence of values listed ++ * in the following 'if' as it reads the request and writes its data. ++ * It takes about a few dozen nanosecs total, with very high variance. ++ */ ++ if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || ++ str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! */ ++ str3 == H8S_STR3_SWMF) ++ return -EBUSY; /* not ready yet */ ++ /* Finally, the EC signals output buffer full: */ ++ if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { ++ printk(KERN_WARNING ++ REQ_FMT("bad initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Read first byte (signals start of read transactions): */ ++ data->val[0] = inb(TPC_TWR0_PORT); ++ /* Optionally read 14 more bytes: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((data->mask >> i)&1) ++ data->val[i] = inb(TPC_TWR0_PORT+i); ++ /* Read last byte from 0x161F (signals end of read transaction): */ ++ data->val[0xF] = inb(TPC_TWR15_PORT); ++ ++ /* Readout still pending? */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) ++ printk(KERN_WARNING ++ REQ_FMT("OBF3B=1 after read", str3)); ++ /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ ++ if (data->val[0xF] == 0x80) ++ printk(KERN_WARNING ++ REQ_FMT("0x161F reports error", data->val[0xF])); ++ return 0; ++} ++ ++/** ++ * thinkpad_ec_is_row_fetched - is the given row currently prefetched? ++ * ++ * To keep things simple we compare only the first and last args; ++ * this suffices for all known cases. ++ */ ++static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args) ++{ ++ return (prefetch_jiffies != TPC_PREFETCH_NONE) && ++ (prefetch_jiffies != TPC_PREFETCH_JUNK) && ++ (prefetch_arg0 == args->val[0]) && ++ (prefetch_argF == args->val[0xF]) && ++ (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT); ++} ++ ++/** ++ * thinkpad_ec_read_row - request and read data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Read a data row from the ThinkPad embedded controller LPC3 interface. ++ * Does fetching and retrying if needed. The row is specified by an ++ * array of 16 bytes, some of which may be undefined (but the first is ++ * mandatory). These bytes are given in @args->val[], where @args->val[i] is ++ * used iff (@args->mask>>i)&1). The resulting row data is stored in ++ * @data->val[], but is only guaranteed to be valid for indices corresponding ++ * to set bit in @data->mask. That is, if @data->mask&(1<val[i] is undefined. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. 
++ */ ++int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int retries, ret; ++ ++ if (thinkpad_ec_is_row_fetched(args)) ++ goto read_row; /* already requested */ ++ ++ /* Request the row */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_request_row(args); ++ if (!ret) ++ goto read_row; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ printk(KERN_ERR REQ_FMT("failed requesting row", ret)); ++ goto out; ++ ++read_row: ++ /* Read the row's data */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ goto out; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ ++ printk(KERN_ERR REQ_FMT("failed waiting for data", ret)); ++ ++out: ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_read_row); ++ ++/** ++ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Try reading a data row from the ThinkPad embedded controller LPC3 ++ * interface, if this raw was recently prefetched using ++ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block. ++ * The parameters have the same meaning as in thinkpad_ec_read_row(). ++ * ++ * Returns -EBUSY is data not ready and -ENODATA if row not prefetched. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int ret; ++ if (!thinkpad_ec_is_row_fetched(args)) { ++ ret = -ENODATA; ++ } else { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */ ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row); ++ ++/** ++ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC ++ * @args Input register arguments ++ * ++ * Prefetch a data row from the ThinkPad embedded controller LCP3 ++ * interface. A subsequent call to thinkpad_ec_read_row() with the ++ * same arguments will be faster, and a subsequent call to ++ * thinkpad_ec_try_read_row() stands a good chance of succeeding if ++ * done neither too soon nor too late. See ++ * thinkpad_ec_read_row() for the meaning of @args. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) ++{ ++ int ret; ++ ret = thinkpad_ec_request_row(args); ++ if (ret) { ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ } else { ++ prefetch_jiffies = get_jiffies_64(); ++ prefetch_arg0 = args->val[0x0]; ++ prefetch_argF = args->val[0xF]; ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); ++ ++/** ++ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data ++ * ++ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the ++ * ThinkPad embedded controller LPC3 interface. ++ * Must be called before unlocking by any code that accesses the controller ++ * ports directly. ++ */ ++void thinkpad_ec_invalidate(void) ++{ ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); ++ ++ ++/*** Checking for EC hardware ***/ ++ ++/** ++ * thinkpad_ec_test - verify the EC is present and follows protocol ++ * ++ * Ensure the EC LPC3 channel really works on this machine by making ++ * an EC request and seeing if the EC follows the documented H8S protocol. 
++ * The requested row just reads battery status, so it should be harmless to ++ * access it (on a correct EC). ++ * This test writes to IO ports, so execute only after checking DMI. ++ */ ++static int __init thinkpad_ec_test(void) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = /* battery 0 basic status */ ++ { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; ++ struct thinkpad_ec_row data = { .mask = 0x0000 }; ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ return ret; ++} ++ ++/* Search all DMI device names of a given type for a substring */ ++static int __init dmi_find_substring(int type, const char *substr) ++{ ++ const struct dmi_device *dev = NULL; ++ while ((dev = dmi_find_device(type, NULL, dev))) { ++ if (strstr(dev->name, substr)) ++ return 1; ++ } ++ return 0; ++} ++ ++#define TP_DMI_MATCH(vendor,model) { \ ++ .ident = vendor " " model, \ ++ .matches = { \ ++ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ ++ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ ++ } \ ++} ++ ++/* Check DMI for existence of ThinkPad embedded controller */ ++static int __init check_dmi_for_ec(void) ++{ ++ /* A few old models that have a good EC but don't report it in DMI */ ++ struct dmi_system_id tp_whitelist[] = { ++ TP_DMI_MATCH("IBM", "ThinkPad A30"), ++ TP_DMI_MATCH("IBM", "ThinkPad T23"), ++ TP_DMI_MATCH("IBM", "ThinkPad X24"), ++ TP_DMI_MATCH("LENOVO", "ThinkPad"), ++ { .ident = NULL } ++ }; ++ return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, ++ "IBM ThinkPad Embedded Controller") || ++ dmi_check_system(tp_whitelist); ++} ++ ++/*** Init and cleanup ***/ ++ ++static int __init thinkpad_ec_init(void) ++{ ++ if (!check_dmi_for_ec()) { ++ printk(KERN_WARNING ++ "thinkpad_ec: no ThinkPad embedded controller!\n"); ++ return -ENODEV; ++ } ++ ++ if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { ++ reserved_io = 1; ++ } else { ++ printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... ", ++ TPC_BASE_PORT, ++ TPC_BASE_PORT + TPC_NUM_PORTS - 1); ++ if (force_io) { ++ printk("forcing use of unreserved IO ports.\n"); ++ } else { ++ printk("consider using force_io=1.\n"); ++ return -ENXIO; ++ } ++ } ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ if (thinkpad_ec_test()) { ++ printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ return -ENXIO; ++ } ++ printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); ++ return 0; ++} ++ ++static void __exit thinkpad_ec_exit(void) ++{ ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ printk(KERN_INFO "thinkpad_ec: unloaded.\n"); ++} ++ ++module_init(thinkpad_ec_init); ++module_exit(thinkpad_ec_exit); +diff --git a/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c +new file mode 100644 +index 000000000000..209cb6487e24 +--- /dev/null ++++ b/drivers/platform/x86/tp_smapi.c +@@ -0,0 +1,1493 @@ ++/* ++ * tp_smapi.c - ThinkPad SMAPI support ++ * ++ * This driver exposes some features of the System Management Application ++ * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on ++ * models in which the SMAPI BIOS runs in SMM and is invoked by writing ++ * to the APM control port 0xB2. ++ * It also exposes battery status information, obtained from the ThinkPad ++ * embedded controller (via the thinkpad_ec module). ++ * Ancient ThinkPad models use a different interface, supported by the ++ * "thinkpad" module from "tpctl". 
++ * ++ * Many of the battery status values obtained from the EC simply mirror ++ * values provided by the battery's Smart Battery System (SBS) interface, so ++ * their meaning is defined by the Smart Battery Data Specification (see ++ * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec ++ * are given in the code where relevant. ++ * ++ * Copyright (C) 2006 Shem Multinymous . ++ * SMAPI access code based on the mwave driver by Mike Sullivan. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include /* CMOS defines */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define TP_VERSION "0.42" ++#define TP_DESC "ThinkPad SMAPI Support" ++#define TP_DIR "smapi" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION(TP_DESC); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++static struct platform_device *pdev; ++ ++static int tp_debug; ++module_param_named(debug, tp_debug, int, 0600); ++MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)"); ++ ++/* A few macros for printk()ing: */ ++#define TPRINTK(level, fmt, args...) \ ++ dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args) ++#define DPRINTK(fmt, args...) \ ++ do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0) ++ ++/********************************************************************* ++ * SMAPI interface ++ */ ++ ++/* SMAPI functions (register BX when making the SMM call). */ ++#define SMAPI_GET_INHIBIT_CHARGE 0x2114 ++#define SMAPI_SET_INHIBIT_CHARGE 0x2115 ++#define SMAPI_GET_THRESH_START 0x2116 ++#define SMAPI_SET_THRESH_START 0x2117 ++#define SMAPI_GET_FORCE_DISCHARGE 0x2118 ++#define SMAPI_SET_FORCE_DISCHARGE 0x2119 ++#define SMAPI_GET_THRESH_STOP 0x211a ++#define SMAPI_SET_THRESH_STOP 0x211b ++ ++/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at ++ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD */ ++#define SMAPI_RETCODE_EOF 0xff ++static struct { u8 rc; char *msg; int ret; } smapi_retcode[] = ++{ ++ {0x00, "OK", 0}, ++ {0x53, "SMAPI function is not available", -ENXIO}, ++ {0x81, "Invalid parameter", -EINVAL}, ++ {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP}, ++ {0x90, "System error", -EIO}, ++ {0x91, "System is invalid", -EIO}, ++ {0x92, "System is busy, -EBUSY"}, ++ {0xa0, "Device error (disk read error)", -EIO}, ++ {0xa1, "Device is busy", -EBUSY}, ++ {0xa2, "Device is not attached", -ENXIO}, ++ {0xa3, "Device is disbled", -EIO}, ++ {0xa4, "Request parameter is out of range", -EINVAL}, ++ {0xa5, "Request parameter is not accepted", -EINVAL}, ++ {0xa6, "Transient error", -EBUSY}, /* ? 
*/ ++ {SMAPI_RETCODE_EOF, "Unknown error code", -EIO} ++}; ++ ++ ++#define SMAPI_MAX_RETRIES 10 ++#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */ ++static unsigned short smapi_port; /* APM control port, normally 0xB2 */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(smapi_mutex); ++#else ++static DEFINE_SEMAPHORE(smapi_mutex); ++#endif ++ ++/** ++ * find_smapi_port - read SMAPI port from NVRAM ++ */ ++static int __init find_smapi_port(void) ++{ ++ u16 smapi_id = 0; ++ unsigned short port = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&rtc_lock, flags); ++ smapi_id = CMOS_READ(0x7C); ++ smapi_id |= (CMOS_READ(0x7D) << 8); ++ spin_unlock_irqrestore(&rtc_lock, flags); ++ ++ if (smapi_id != 0x5349) { ++ printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id); ++ return -ENXIO; ++ } ++ spin_lock_irqsave(&rtc_lock, flags); ++ port = CMOS_READ(0x7E); ++ port |= (CMOS_READ(0x7F) << 8); ++ spin_unlock_irqrestore(&rtc_lock, flags); ++ if (port == 0) { ++ printk(KERN_ERR "unable to read SMAPI port number\n"); ++ return -ENXIO; ++ } ++ return port; ++} ++ ++/** ++ * smapi_request - make a SMAPI call ++ * @inEBX, @inECX, @inEDI, @inESI: input registers ++ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: outputs registers ++ * @msg: textual error message ++ * Invokes the SMAPI SMBIOS with the given input and outpu args. ++ * All outputs are optional (can be %NULL). ++ * Returns 0 when successful, and a negative errno constant ++ * (see smapi_retcode above) upon failure. ++ */ ++static int smapi_request(u32 inEBX, u32 inECX, ++ u32 inEDI, u32 inESI, ++ u32 *outEBX, u32 *outECX, u32 *outEDX, ++ u32 *outEDI, u32 *outESI, const char **msg) ++{ ++ int ret = 0; ++ int i; ++ int retries; ++ u8 rc; ++ /* Must use local vars for output regs, due to reg pressure. */ ++ u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI; ++ ++ for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) { ++ DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x", ++ inEBX, inECX, inEDI, inESI); ++ ++ /* SMAPI's SMBIOS call and thinkpad_ec end up using use ++ * different interfaces to the same chip, so play it safe. */ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ++ __asm__ __volatile__( ++ "movl $0x00005380,%%eax\n\t" ++ "movl %6,%%ebx\n\t" ++ "movl %7,%%ecx\n\t" ++ "movl %8,%%edi\n\t" ++ "movl %9,%%esi\n\t" ++ "xorl %%edx,%%edx\n\t" ++ "movw %10,%%dx\n\t" ++ "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */ ++ "out %%al,$0x4F\n\t" ++ "movl %%eax,%0\n\t" ++ "movl %%ebx,%1\n\t" ++ "movl %%ecx,%2\n\t" ++ "movl %%edx,%3\n\t" ++ "movl %%edi,%4\n\t" ++ "movl %%esi,%5\n\t" ++ :"=m"(tmpEAX), ++ "=m"(tmpEBX), ++ "=m"(tmpECX), ++ "=m"(tmpEDX), ++ "=m"(tmpEDI), ++ "=m"(tmpESI) ++ :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI), ++ "m"((u16)smapi_port) ++ :"%eax", "%ebx", "%ecx", "%edx", "%edi", ++ "%esi"); ++ ++ thinkpad_ec_invalidate(); ++ thinkpad_ec_unlock(); ++ ++ /* Don't let the next SMAPI access happen too quickly, ++ * may case problems. (We're hold smapi_mutex). 
*/ ++ msleep(50); ++ ++ if (outEBX) *outEBX = tmpEBX; ++ if (outECX) *outECX = tmpECX; ++ if (outEDX) *outEDX = tmpEDX; ++ if (outESI) *outESI = tmpESI; ++ if (outEDI) *outEDI = tmpEDI; ++ ++ /* Look up error code */ ++ rc = (tmpEAX>>8)&0xFF; ++ for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF && ++ smapi_retcode[i].rc != rc; ++i) {} ++ ret = smapi_retcode[i].ret; ++ if (msg) ++ *msg = smapi_retcode[i].msg; ++ ++ DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d", ++ tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)", ++ smapi_retcode[i].msg, inEBX); ++ ++ if (ret != -EBUSY) ++ return ret; ++ } ++ return ret; ++} ++ ++/* Convenience wrapper: discard output arguments */ ++static int smapi_write(u32 inEBX, u32 inECX, ++ u32 inEDI, u32 inESI, const char **msg) ++{ ++ return smapi_request(inEBX, inECX, inEDI, inESI, ++ NULL, NULL, NULL, NULL, NULL, msg); ++} ++ ++ ++/********************************************************************* ++ * Specific SMAPI services ++ * All of these functions return 0 upon success, and a negative errno ++ * constant (see smapi_retcode) on failure. ++ */ ++ ++enum thresh_type { ++ THRESH_STOP = 0, /* the code assumes this is 0 for brevity */ ++ THRESH_START ++}; ++#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop") ++ ++/** ++ * __get_real_thresh - read battery charge start/stop threshold from SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default 1..99, 0=default (pass this as-is to SMAPI) ++ * @outEDI: some additional state that needs to be preserved, meaning unknown ++ * @outESI: some additional state that needs to be preserved, meaning unknown ++ */ ++static int __get_real_thresh(int bat, enum thresh_type which, int *thresh, ++ u32 *outEDI, u32 *outESI) ++{ ++ u32 ebx = (which == THRESH_START) ? SMAPI_GET_THRESH_START ++ : SMAPI_GET_THRESH_STOP; ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(ebx, ecx, 0, 0, NULL, ++ &ecx, NULL, outEDI, outESI, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s", ++ THRESH_NAME(which), bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x00000100)) { ++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x", ++ THRESH_NAME(which), bat, ecx); ++ return -EIO; ++ } ++ if (thresh) ++ *thresh = ecx&0xFF; ++ return 0; ++} ++ ++/** ++ * get_real_thresh - read battery charge start/stop threshold from SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default (passes as-is to SMAPI) ++ */ ++static int get_real_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ return __get_real_thresh(bat, which, thresh, NULL, NULL); ++} ++ ++/** ++ * set_real_thresh - write battery start/top charge threshold to SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default (passes as-is to SMAPI) ++ */ ++static int set_real_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ u32 ebx = (which == THRESH_START) ? 
SMAPI_SET_THRESH_START ++ : SMAPI_SET_THRESH_STOP; ++ u32 ecx = ((bat+1)<<8) + thresh; ++ u32 getDI, getSI; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); ++ if (ret) ++ return ret; ++ ++ ret = smapi_write(ebx, ecx, getDI, getSI, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", ++ THRESH_NAME(which), thresh, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set %s to %d for bat=%d", ++ THRESH_NAME(which), thresh, bat); ++ return ret; ++} ++ ++/** ++ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * @outECX: some additional state that needs to be preserved, meaning unknown ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) ++{ ++ u32 ecx = (bat+1)<<8; ++ u32 esi; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, &esi, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x0100)) { ++ TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); ++ return -EIO; ++ } ++ if (minutes) ++ *minutes = (ecx&0x0001)?esi:0; ++ if (outECX) ++ *outECX = ecx; ++ return 0; ++} ++ ++/** ++ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int get_inhibit_charge_minutes(int bat, int *minutes) ++{ ++ return __get_inhibit_charge_minutes(bat, minutes, NULL); ++} ++ ++/** ++ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ */ ++static int set_inhibit_charge_minutes(int bat, int minutes) ++{ ++ u32 ecx; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); ++ if (ret) ++ return ret; ++ ++ ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 
0x0001 : 0x0000); ++ if (minutes > 0xFFFF) ++ minutes = 0xFFFF; ++ ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, ++ "set to %d failed for bat=%d: %s", minutes, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); ++ return ret; ++} ++ ++ ++/** ++ * get_force_discharge - get status of forced discharging from SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int get_force_discharge(int bat, int *enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; ++ return 0; ++} ++ ++/** ++ * set_force_discharge - write status of forced discharging to SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int set_force_discharge(int bat, int enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (ecx&0x00000100) { ++ TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); ++ return -EIO; ++ } ++ ++ ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); ++ ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", ++ enabled, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); ++ return ret; ++} ++ ++ ++/********************************************************************* ++ * Wrappers to threshold-related SMAPI functions, which handle default ++ * thresholds and related quirks. ++ */ ++ ++/* Minimum, default and minimum difference for battery charging thresholds: */ ++#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ ++#define MIN_THRESH_START 2 ++#define MAX_THRESH_START (100-MIN_THRESH_DELTA) ++#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) ++#define MAX_THRESH_STOP 100 ++#define DEFAULT_THRESH_START MAX_THRESH_START ++#define DEFAULT_THRESH_STOP MAX_THRESH_STOP ++ ++/* The GUI of IBM's Battery Maximizer seems to show a start threshold that ++ * is 1 more than the value we set/get via SMAPI. Since the threshold is ++ * maintained across reboot, this can be confusing. So we kludge our ++ * interface for interoperability: */ ++#define BATMAX_FIX 1 ++ ++/* Get charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. */ ++static int get_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ int ret = get_real_thresh(bat, which, thresh); ++ if (ret) ++ return ret; ++ if (*thresh == 0) ++ *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START ++ : DEFAULT_THRESH_STOP; ++ else if (which == THRESH_START) ++ *thresh += BATMAX_FIX; ++ return 0; ++} ++ ++ ++/* Set charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. 
*/ ++static int set_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) ++ thresh = 0; /* 100 is out of range, but default means 100 */ ++ if (which == THRESH_START) ++ thresh -= BATMAX_FIX; ++ return set_real_thresh(bat, which, thresh); ++} ++ ++/********************************************************************* ++ * ThinkPad embedded controller readout and basic functions ++ */ ++ ++/** ++ * read_tp_ec_row - read data row from the ThinkPad embedded controller ++ * @arg0: EC command code ++ * @bat: battery number, 0 or 1 ++ * @j: the byte value to be used for "junk" (unused) input/outputs ++ * @dataval: result vector ++ */ ++static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = { .mask = 0xFFFF, ++ .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; ++ struct thinkpad_ec_row data = { .mask = 0xFFFF }; ++ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); ++ return ret; ++} ++ ++/** ++ * power_device_present - check for presence of battery or AC power ++ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power ++ * Returns 1 if present, 0 if not present, negative if error. ++ */ ++static int power_device_present(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u8 test; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (bat) { ++ case 0: test = 0x40; break; /* battery 0 */ ++ case 1: test = 0x20; break; /* battery 1 */ ++ default: test = 0x80; /* AC power */ ++ } ++ return (row[0] & test) ? 1 : 0; ++} ++ ++/** ++ * bat_has_status - check if battery can report detailed status ++ * @bat: 0 for battery 0, 1 for battery 1 ++ * Returns 1 if yes, 0 if no, negative if error. ++ */ ++static int bat_has_status(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ ++ return 0; ++ if ((row[1] & (0x60)) == 0) /* no status */ ++ return 0; ++ return 1; ++} ++ ++/** ++ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data ++ * @arg0: first argument to EC ++ * @off: offset in row returned from EC ++ * @bat: battery (0 or 1) ++ * @val: the 16-bit value obtained ++ * Returns nonzero on error. 
++ */ ++static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ *val = *(u16 *)(row+offset); ++ return 0; ++} ++ ++/********************************************************************* ++ * sysfs attributes for batteries - ++ * definitions and helper functions ++ */ ++ ++/* A custom device attribute struct which holds a battery number */ ++struct bat_device_attribute { ++ struct device_attribute dev_attr; ++ int bat; ++}; ++ ++/** ++ * attr_get_bat - get the battery to which the attribute belongs ++ */ ++static int attr_get_bat(struct device_attribute *attr) ++{ ++ return container_of(attr, struct bat_device_attribute, dev_attr)->bat; ++} ++ ++/** ++ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @na_msg: string to output is value not available (0xFFFFFFFF) ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as unsigned, ++ * transformed as x->mul*x, and printed to the buffer. ++ * If the value is 0xFFFFFFFF and na_msg!=%NULL, na_msg is printed instead. ++ */ ++static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, ++ const char *na_msg, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ if (na_msg && val == 0xFFFF) ++ return sprintf(buf, "%s\n", na_msg); ++ else ++ return sprintf(buf, "%u\n", mul*(unsigned int)val); ++} ++ ++/** ++ * show_tp_ec_bat_s16 - show an signed 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @add: correction term to add after multiplication ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as signed, ++ * transformed as x->mul*x+add, and printed to the buffer. 
++ */ ++static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", mul*(s16)val+add); ++} ++ ++/** ++ * show_tp_ec_bat_str - show a string from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @maxlen: maximum string length ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, ++ struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ strncpy(buf, (char *)row+offset, maxlen); ++ buf[maxlen] = 0; ++ strcat(buf, "\n"); ++ return strlen(buf); ++} ++ ++/** ++ * show_tp_ec_bat_power - show a power readout from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offV: byte offset of voltage in EC raw data ++ * @offI: byte offset of current in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ * Computes the power as current*voltage from the two given readout offsets. ++ */ ++static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int milliamp, millivolt, ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ millivolt = *(u16 *)(row+offV); ++ milliamp = *(s16 *)(row+offI); ++ return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ ++} ++ ++/** ++ * show_tp_ec_bat_date - decode and show a date from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u16 v; ++ int ret; ++ int day, month, year; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ ++ /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ ++ v = *(u16 *)(row+offset); ++ day = v & 0x1F; ++ month = (v >> 5) & 0xF; ++ year = (v >> 9) + 1980; ++ ++ return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O for batteries - ++ * the actual attribute show/store functions ++ */ ++ ++static ssize_t show_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_START, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++static ssize_t show_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_STOP, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++/** ++ * store_battery_start_charge_thresh - store 
battery_start_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ ++ thresh = MIN_THRESH_START; ++ if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ ++ thresh = MAX_THRESH_START; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_STOP, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { ++ if (ret) /* other threshold is set? */ ++ goto out; ++ ret = get_real_thresh(bat, THRESH_START, NULL); ++ if (ret) /* this threshold is set? */ ++ goto out; ++ if (other_thresh < thresh+MIN_THRESH_DELTA) { ++ /* move other thresh to keep it above this one */ ++ ret = set_thresh(bat, THRESH_STOP, ++ thresh+MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_START, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++ ++} ++ ++/** ++ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ ++ thresh = MIN_THRESH_STOP; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_START, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ ++ if (ret) ++ goto out; ++ /* this threshold exists? 
*/ ++ ret = get_real_thresh(bat, THRESH_STOP, NULL); ++ if (ret) ++ goto out; ++ if (other_thresh >= thresh-MIN_THRESH_DELTA) { ++ /* move other thresh to be below this one */ ++ ret = set_thresh(bat, THRESH_START, ++ thresh-MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_STOP, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++} ++ ++static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int minutes; ++ int bat = attr_get_bat(attr); ++ int ret = get_inhibit_charge_minutes(bat, &minutes); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", minutes); /* units: minutes */ ++} ++ ++static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int ret; ++ int minutes; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { ++ TPRINTK(KERN_ERR, "inhibit_charge_minutes: " ++ "must be a non-negative integer"); ++ return -EINVAL; ++ } ++ ret = set_inhibit_charge_minutes(bat, minutes); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int enabled; ++ int bat = attr_get_bat(attr); ++ int ret = get_force_discharge(bat, &enabled); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", enabled); /* type: boolean */ ++} ++ ++static ssize_t store_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int ret; ++ int enabled; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) ++ return -EINVAL; ++ ret = set_force_discharge(bat, enabled); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_installed( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ int ret = power_device_present(bat); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++static ssize_t show_battery_state( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ const char *txt; ++ int ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return sprintf(buf, "none\n"); ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (row[1] & 0xf0) { ++ case 0xc0: txt = "idle"; break; ++ case 0xd0: txt = "discharging"; break; ++ case 0xe0: txt = "charging"; break; ++ default: return sprintf(buf, "unknown (0x%x)\n", row[1]); ++ } ++ return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ ++} ++ ++static ssize_t show_battery_manufacturer( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34: ManufacturerName() */ ++ return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_model( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. 
SBS spec v1.1 p34: DeviceName() */ ++ return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_barcoding( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string */ ++ return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_chemistry( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ ++ return show_tp_ec_bat_str(6, 2, 5, attr, buf); ++} ++ ++static ssize_t show_battery_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p24: Voltage() */ ++ return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_design_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ ++ return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */ ++ return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group0_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group1_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group2_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group3_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_current_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: Current() */ ++ return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_current_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ ++ return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_current( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ ++ return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_power_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*Current() */ ++ return show_tp_ec_bat_power(1, 6, 8, attr, buf); ++} ++ ++static ssize_t show_battery_power_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */ ++ return show_tp_ec_bat_power(1, 6, 10, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. 
SBS spec v1.1 p25: RelativeStateOfCharge() */ ++ return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent_error( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. SBS spec v1.1 p25: MaxError() */ ++ return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_charging_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ ++ return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26. */ ++ return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_last_full_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ ++ return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_design_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p32: DesignCapacity() */ ++ return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_cycle_count( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ ++ return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_temperature( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ ++ return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); ++} ++ ++static ssize_t show_battery_serial( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: int. SBS spec v1.1 p34: SerialNumber() */ ++ return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_manufacture_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ ++ return show_tp_ec_bat_date(3, 8, attr, buf); ++} ++ ++static ssize_t show_battery_first_use_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD */ ++ return show_tp_ec_bat_date(8, 2, attr, buf); ++} ++ ++/** ++ * show_battery_dump - show the battery's dump attribute ++ * The dump attribute gives a hex dump of all EC readouts related to a ++ * battery. Some of the enumerated values don't really exist (i.e., the ++ * EC function just leaves them untouched); we use a kludge to detect and ++ * denote these. 
++ */ ++#define MIN_DUMP_ARG0 0x00 ++#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */ ++static ssize_t show_battery_dump( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int i; ++ char *p = buf; ++ int bat = attr_get_bat(attr); ++ u8 arg0; /* first argument to EC */ ++ u8 rowa[TP_CONTROLLER_ROW_LEN], ++ rowb[TP_CONTROLLER_ROW_LEN]; ++ const u8 junka = 0xAA, ++ junkb = 0x55; /* junk values for testing changes */ ++ int ret; ++ ++ for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) { ++ if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5) ++ return -ENOMEM; /* don't overflow sysfs buf */ ++ /* Read raw twice with different junk values, ++ * to detect unused output bytes which are left unchaged: */ ++ ret = read_tp_ec_row(arg0, bat, junka, rowa); ++ if (ret) ++ return ret; ++ ret = read_tp_ec_row(arg0, bat, junkb, rowb); ++ if (ret) ++ return ret; ++ for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) { ++ if (rowa[i] == junka && rowb[i] == junkb) ++ p += sprintf(p, "-- "); /* unused by EC */ ++ else ++ p += sprintf(p, "%02x ", rowa[i]); ++ } ++ p += sprintf(p, "\n"); ++ } ++ return p-buf; ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O, other than batteries ++ */ ++ ++static ssize_t show_ac_connected( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int ret = power_device_present(0xFF); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++/********************************************************************* ++ * The the "smapi_request" sysfs attribute executes a raw SMAPI call. ++ * You write to make a request and read to get the result. The state ++ * is saved globally rather than per fd (sysfs limitation), so ++ * simultaenous requests may get each other's results! So this is for ++ * development and debugging only. ++ */ ++#define MAX_SMAPI_ATTR_ANSWER_LEN 128 ++static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = ""; ++ ++static ssize_t show_smapi_request(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer); ++ smapi_attr_answer[0] = '\0'; ++ return ret; ++} ++ ++static ssize_t store_smapi_request(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int inEBX, inECX, inEDI, inESI; ++ u32 outEBX, outECX, outEDX, outEDI, outESI; ++ const char *msg; ++ int ret; ++ if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) { ++ smapi_attr_answer[0] = '\0'; ++ return -EINVAL; ++ } ++ ret = smapi_request( ++ inEBX, inECX, inEDI, inESI, ++ &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg); ++ snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN, ++ "%x %x %x %x %x %d '%s'\n", ++ (unsigned int)outEBX, (unsigned int)outECX, ++ (unsigned int)outEDX, (unsigned int)outEDI, ++ (unsigned int)outESI, ret, msg); ++ if (ret) ++ return ret; ++ else ++ return count; ++} ++ ++/********************************************************************* ++ * Power management: the embedded controller forgets the battery ++ * thresholds when the system is suspended to disk and unplugged from ++ * AC and battery, so we restore it upon resume. 
++ */ ++ ++static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ ++ ++static int tp_suspend(struct platform_device *dev, pm_message_t state) ++{ ++ int restore = (state.event == PM_EVENT_HIBERNATE || ++ state.event == PM_EVENT_FREEZE); ++ if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) ++ saved_threshs[0] = -1; ++ if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) ++ saved_threshs[1] = -1; ++ if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) ++ saved_threshs[2] = -1; ++ if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) ++ saved_threshs[3] = -1; ++ DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ return 0; ++} ++ ++static int tp_resume(struct platform_device *dev) ++{ ++ DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ if (saved_threshs[0] >= 0) ++ set_real_thresh(0, THRESH_STOP , saved_threshs[0]); ++ if (saved_threshs[1] >= 0) ++ set_real_thresh(0, THRESH_START, saved_threshs[1]); ++ if (saved_threshs[2] >= 0) ++ set_real_thresh(1, THRESH_STOP , saved_threshs[2]); ++ if (saved_threshs[3] >= 0) ++ set_real_thresh(1, THRESH_START, saved_threshs[3]); ++ return 0; ++} ++ ++ ++/********************************************************************* ++ * Driver model ++ */ ++ ++static struct platform_driver tp_driver = { ++ .suspend = tp_suspend, ++ .resume = tp_resume, ++ .driver = { ++ .name = "smapi", ++ .owner = THIS_MODULE ++ }, ++}; ++ ++ ++/********************************************************************* ++ * Sysfs device model ++ */ ++ ++/* Attributes in /sys/devices/platform/smapi/ */ ++ ++static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); ++static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, ++ store_smapi_request); ++ ++static struct attribute *tp_root_attributes[] = { ++ &dev_attr_ac_connected.attr, ++ &dev_attr_smapi_request.attr, ++ NULL ++}; ++static struct attribute_group tp_root_attribute_group = { ++ .attrs = tp_root_attributes ++}; ++ ++/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : ++ * Every attribute needs to be defined (i.e., statically allocated) for ++ * each battery, and then referenced in the attribute list of each battery. ++ * We use preprocessor voodoo to avoid duplicating the list of attributes 4 ++ * times. The preprocessor output is just normal sysfs attributes code. 
++ */ ++ ++/** ++ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes ++ * @_BAT: battery number (0 or 1) ++ * @_ATTR_RW: macro to invoke for each read/write attribute ++ * @_ATTR_R: macro to invoke for each read-only attribute ++ */ ++#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \ ++ _ATTR_RW(_BAT, start_charge_thresh) \ ++ _ATTR_RW(_BAT, stop_charge_thresh) \ ++ _ATTR_RW(_BAT, inhibit_charge_minutes) \ ++ _ATTR_RW(_BAT, force_discharge) \ ++ _ATTR_R(_BAT, installed) \ ++ _ATTR_R(_BAT, state) \ ++ _ATTR_R(_BAT, manufacturer) \ ++ _ATTR_R(_BAT, model) \ ++ _ATTR_R(_BAT, barcoding) \ ++ _ATTR_R(_BAT, chemistry) \ ++ _ATTR_R(_BAT, voltage) \ ++ _ATTR_R(_BAT, group0_voltage) \ ++ _ATTR_R(_BAT, group1_voltage) \ ++ _ATTR_R(_BAT, group2_voltage) \ ++ _ATTR_R(_BAT, group3_voltage) \ ++ _ATTR_R(_BAT, current_now) \ ++ _ATTR_R(_BAT, current_avg) \ ++ _ATTR_R(_BAT, charging_max_current) \ ++ _ATTR_R(_BAT, power_now) \ ++ _ATTR_R(_BAT, power_avg) \ ++ _ATTR_R(_BAT, remaining_percent) \ ++ _ATTR_R(_BAT, remaining_percent_error) \ ++ _ATTR_R(_BAT, remaining_charging_time) \ ++ _ATTR_R(_BAT, remaining_running_time) \ ++ _ATTR_R(_BAT, remaining_running_time_now) \ ++ _ATTR_R(_BAT, remaining_capacity) \ ++ _ATTR_R(_BAT, last_full_capacity) \ ++ _ATTR_R(_BAT, design_voltage) \ ++ _ATTR_R(_BAT, charging_max_voltage) \ ++ _ATTR_R(_BAT, design_capacity) \ ++ _ATTR_R(_BAT, cycle_count) \ ++ _ATTR_R(_BAT, temperature) \ ++ _ATTR_R(_BAT, serial) \ ++ _ATTR_R(_BAT, manufacture_date) \ ++ _ATTR_R(_BAT, first_use_date) \ ++ _ATTR_R(_BAT, dump) ++ ++/* Define several macros we will feed into FOREACH_BAT_ATTR: */ ++ ++#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \ ++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ ++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \ ++ store_battery_##_NAME), \ ++ .bat = _BAT \ ++ }; ++ ++#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \ ++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ ++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, 0), \ ++ .bat = _BAT \ ++ }; ++ ++#define REF_BAT_ATTR(_BAT,_NAME) \ ++ &dev_attr_##_NAME##_##_BAT.dev_attr.attr, ++ ++/* This provide all attributes for one battery: */ ++ ++#define PROVIDE_BAT_ATTRS(_BAT) \ ++ FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \ ++ static struct attribute *tp_bat##_BAT##_attributes[] = { \ ++ FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \ ++ NULL \ ++ }; \ ++ static struct attribute_group tp_bat##_BAT##_attribute_group = { \ ++ .name = "BAT" #_BAT, \ ++ .attrs = tp_bat##_BAT##_attributes \ ++ }; ++ ++/* Finally genereate the attributes: */ ++ ++PROVIDE_BAT_ATTRS(0) ++PROVIDE_BAT_ATTRS(1) ++ ++/* List of attribute groups */ ++ ++static struct attribute_group *attr_groups[] = { ++ &tp_root_attribute_group, ++ &tp_bat0_attribute_group, ++ &tp_bat1_attribute_group, ++ NULL ++}; ++ ++ ++/********************************************************************* ++ * Init and cleanup ++ */ ++ ++static struct attribute_group **next_attr_group; /* next to register */ ++ ++static int __init tp_init(void) ++{ ++ int ret; ++ printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n"); ++ ++ ret = find_smapi_port(); ++ if (ret < 0) ++ goto err; ++ else ++ smapi_port = ret; ++ ++ if (!request_region(smapi_port, 1, "smapi")) { ++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", ++ smapi_port); ++ ret = -ENXIO; ++ goto err; ++ } ++ ++ if (!request_region(SMAPI_PORT2, 1, "smapi")) { ++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", ++ SMAPI_PORT2); 
++ ret = -ENXIO; ++ goto err_port1; ++ } ++ ++ ret = platform_driver_register(&tp_driver); ++ if (ret) ++ goto err_port2; ++ ++ pdev = platform_device_alloc("smapi", -1); ++ if (!pdev) { ++ ret = -ENOMEM; ++ goto err_driver; ++ } ++ ++ ret = platform_device_add(pdev); ++ if (ret) ++ goto err_device_free; ++ ++ for (next_attr_group = attr_groups; *next_attr_group; ++ ++next_attr_group) { ++ ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); ++ if (ret) ++ goto err_attr; ++ } ++ ++ printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", ++ smapi_port); ++ return 0; ++ ++err_attr: ++ while (--next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++err_device_free: ++ platform_device_put(pdev); ++err_driver: ++ platform_driver_unregister(&tp_driver); ++err_port2: ++ release_region(SMAPI_PORT2, 1); ++err_port1: ++ release_region(smapi_port, 1); ++err: ++ printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); ++ return ret; ++} ++ ++static void __exit tp_exit(void) ++{ ++ while (next_attr_group && --next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++ platform_driver_unregister(&tp_driver); ++ release_region(SMAPI_PORT2, 1); ++ if (smapi_port) ++ release_region(smapi_port, 1); ++ ++ printk(KERN_INFO "tp_smapi unloaded.\n"); ++} ++ ++module_init(tp_init); ++module_exit(tp_exit); +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ ---help--- ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. 
++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/fs/exec.c b/fs/exec.c +index 65eaacaba4f4..1d3b310bd5f0 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -63,6 +63,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -866,9 +868,12 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) + if (err) + goto exit; + +- if (name->name[0] != '\0') ++ if (name->name[0] != '\0') { + fsnotify_open(file); + ++ trace_open_exec(name->name); ++ } ++ + out: + return file; + +diff --git a/fs/open.c b/fs/open.c +index cb81623a8b09..a92b0f6061ac 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -34,6 +34,9 @@ + + #include "internal.h" + ++#define CREATE_TRACE_POINTS ++#include ++ + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) + { +@@ -1068,6 +1071,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) + } else { + fsnotify_open(f); + fd_install(fd, f); ++ trace_do_sys_open(tmp->name, flags, mode); + } + } + putname(tmp); +diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h +new file mode 100644 +index 000000000000..fb634b74adf3 +--- /dev/null ++++ b/include/trace/events/fs.h +@@ -0,0 +1,53 @@ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM fs ++ ++#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_FS_H ++ ++#include ++#include ++ ++TRACE_EVENT(do_sys_open, ++ ++ TP_PROTO(const char *filename, int flags, int mode), ++ ++ TP_ARGS(filename, flags, mode), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ __field( int, flags ) ++ __field( int, mode ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ __entry->flags = flags; ++ __entry->mode = mode; ++ ), ++ ++ TP_printk("\"%s\" %x %o", ++ __get_str(filename), __entry->flags, __entry->mode) ++); ++ ++TRACE_EVENT(open_exec, ++ ++ TP_PROTO(const char *filename), ++ ++ TP_ARGS(filename), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ ), ++ ++ TP_printk("\"%s\"", ++ __get_str(filename)) ++); ++ ++#endif /* _TRACE_FS_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 79226ca8f80f..2a30060e7e1d 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -47,7 +47,11 @@ struct blk_queue_stats; + struct blk_stat_callback; + + #define BLKDEV_MIN_RQ 4 ++#ifdef CONFIG_ZENIFY ++#define BLKDEV_MAX_RQ 512 ++#else + #define BLKDEV_MAX_RQ 128 /* Default maximum */ ++#endif + + /* Must be consistent with blk_mq_poll_stats_bkt() */ + #define BLK_MQ_POLL_STATS_BKTS 16 +diff --git a/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h +new file mode 100644 +index 000000000000..1b80d7ee5493 +--- /dev/null ++++ b/include/linux/thinkpad_ec.h +@@ -0,0 +1,47 @@ ++/* ++ * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions ++ * ++ * Copyright (C) 2005 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _THINKPAD_EC_H ++#define _THINKPAD_EC_H ++ ++#ifdef __KERNEL__ ++ ++#define TP_CONTROLLER_ROW_LEN 16 ++ ++/* EC transactions input and output (possibly partial) vectors of 16 bytes. */ ++struct thinkpad_ec_row { ++ u16 mask; /* bitmap of which entries of val[] are meaningful */ ++ u8 val[TP_CONTROLLER_ROW_LEN]; ++}; ++ ++extern int __must_check thinkpad_ec_lock(void); ++extern int __must_check thinkpad_ec_try_lock(void); ++extern void thinkpad_ec_unlock(void); ++ ++extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data); ++extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *mask); ++extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); ++extern void thinkpad_ec_invalidate(void); ++ ++ ++#endif /* __KERNEL */ ++#endif /* _THINKPAD_EC_H */ +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). + */ + #define MIN_NR_CONSOLES 1 /* must be at least 1 */ +-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ ++/* ++ * NR_TTY_DEVICES: ++ * Value MUST be at least 12 and must never be higher then 63 ++ */ ++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ + /* Note: the ioctl VT_GETSTATE does not work for + consoles 16 and higher (since it returns a short) */ + +diff --git a/init/Kconfig b/init/Kconfig +index 041f3a022122..5ed70eb1ad3a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. 
++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +@@ -1026,6 +1058,13 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + ++config CC_OPTIMIZE_HARDER ++ bool "Optimize harder" ++ help ++ This option will pass "-O3" to your compiler resulting in a ++ larger and faster kernel. The more complex optimizations also ++ increase compilation time and may affect stability. ++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size" + help +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f0a0be4d344..bada807c7e59 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + #ifdef CONFIG_SMP + /* +@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + /* + * The margin used when comparing utilization with CPU capacity: +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 337c6afb3345..9315e358f292 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 80dad301361d..42b7fa7d01f8 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -702,6 +702,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO + +From: Nick Desaulniers +Date: Mon, 24 Dec 2018 13:37:41 +0200 +Subject: include/linux/compiler*.h: define asm_volatile_goto + +asm_volatile_goto should also be defined for other compilers that +support asm goto. + +Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h +mutually exclusive"). + +Signed-off-by: Nick Desaulniers +Signed-off-by: Miguel Ojeda + +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +index ba814f1..e77eeb0 100644 +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -188,6 +188,10 @@ struct ftrace_likely_data { + #define asm_volatile_goto(x...) asm goto(x) + #endif + ++#ifndef asm_volatile_goto ++#define asm_volatile_goto(x...) asm goto(x) ++#endif ++ + /* Are two types/vars the same type (ignoring qualifiers)? 
*/ + #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) + +From: Andy Lavr +Date: Mon, 24 Dec 2018 14:57:47 +0200 +Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: +https://lwn.net/Articles/711248/ + +Signed-off-by: Andy Lavr + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index e84a10b..21d62b7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<hw.mac; + struct e1000_phy_info *phy = &adapter->hw.phy; + struct e1000_ring *tx_ring = adapter->tx_ring; +- u32 dmoff_exit_timeout = 100, tries = 0; + struct e1000_hw *hw = &adapter->hw; ++ u32 link, tctl; +- u32 link, tctl, pcim_state; + + if (test_bit(__E1000_DOWN, &adapter->state)) + return; +@@ -5188,21 +5187,6 @@ static void e1000_watchdog_task(struct work_struct *work) + /* Cancel scheduled suspend requests. */ + pm_runtime_resume(netdev->dev.parent); + +- /* Checking if MAC is in DMoff state*/ +- pcim_state = er32(STATUS); +- while (pcim_state & E1000_STATUS_PCIM_STATE) { +- if (tries++ == dmoff_exit_timeout) { +- e_dbg("Error in exiting dmoff\n"); +- break; +- } +- usleep_range(10000, 20000); +- pcim_state = er32(STATUS); +- +- /* Checking if MAC exited DMoff state */ +- if (!(pcim_state & E1000_STATUS_PCIM_STATE)) +- e1000_phy_hw_reset(&adapter->hw); +- } +- + /* update snapshot of PHY registers on LSC */ + e1000_phy_read_status(adapter); + mac->ops.get_link_up_info(&adapter->hw, +From adb1f9df27f08e6488bcd80b1607987c6114a77a Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 076ba7308e65..81f89095aa77 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index f3ea78b0c91c..4dbacc6b073b 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -621,7 +621,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 14:32:50 -0300 +Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead + pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index a2adf95b3f9c..e804d9f7583a 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); + void task_dirty_inc(struct task_struct *tsk); + + /* readahead.c */ +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read); diff --git a/linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch b/linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch b/linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch new file mode 100644 index 0000000..c42f266 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch @@ -0,0 +1,17679 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 8dee8f68fe15..e56fb275f607 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4277,6 +4277,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..ff41dfacb34b 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -46,6 +46,7 @@ show up in /proc/sys/kernel: + - hung_task_check_interval_secs + - hung_task_warnings + - hyperv_record_panic_msg ++- iso_cpu + - kexec_load_disabled + - kptr_restrict + - l2cr [ PPC only ] +@@ -82,6 +83,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +107,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -438,6 +441,16 @@ When kptr_restrict is set to (2), kernel pointers printed using + %pK will be replaced with 0's regardless of privileges. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + l2cr: (PPC only) + ================ + +@@ -905,6 +918,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? 
+ + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + rtsig-max & rtsig-nr: + ===================== + +@@ -1175,3 +1202,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. ++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. 
++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". 
This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. 
If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. 
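To make the two mechanisms above a little more concrete, the nice-scaled virtual deadline and the unaltered/doubled/quadrupled effective deadline used when comparing tasks across CPUs, here is a stand-alone C sketch. It is purely illustrative and is not BFS code: the baseline ratio of 100, the integer 10%-per-level compounding and the locality encoding are assumptions chosen only to mirror the prose.

  /* Illustrative sketch, not BFS source: a virtual deadline offset by
   * prio_ratio * rr_interval, then left as-is for cache local CPUs,
   * doubled for cache distant CPUs and quadrupled for very distant ones. */
  #include <stdio.h>

  #define RR_INTERVAL 6   /* default rr_interval, in jiffies stand-ins */

  /* "increases by 10% per nice level" relative to the nice -20 baseline */
  static unsigned long prio_ratio(int nice)
  {
          unsigned long ratio = 100;      /* assumed baseline at nice -20 */

          for (int level = -20; level < nice; level++)
                  ratio = ratio * 11 / 10;
          return ratio;
  }

  static unsigned long virtual_deadline(unsigned long now, int nice)
  {
          return now + prio_ratio(nice) * RR_INTERVAL;
  }

  /* locality: 0 = cache local, 1 = cache distant, 2 = very distant (NUMA) */
  static unsigned long effective_deadline(unsigned long deadline, int locality)
  {
          return deadline << locality;
  }

  int main(void)
  {
          unsigned long now = 1000;       /* pretend current jiffies */

          printf("nice -20: deadline %lu\n", virtual_deadline(now, -20));
          printf("nice   0: deadline %lu\n", virtual_deadline(now, 0));
          printf("nice  19: deadline %lu\n", virtual_deadline(now, 19));

          for (int locality = 0; locality < 3; locality++)
                  printf("nice 0, locality %d: effective deadline %lu\n",
                         locality, effective_deadline(virtual_deadline(now, 0), locality));
          return 0;
  }

The deadlines themselves only scale linearly with prio_ratio; the squared CPU proportion described earlier falls out of the fact that a running task's deadline does not move until its time_slice expires.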
++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. 
++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. 
Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. 
This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued proecesses and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. 
Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. 
++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examine lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regarless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
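The preference order above is easiest to see as code. The sketch below is illustrative only and is not MuQSS source: it assumes the per-CPU candidate list has already been built in cache-locality order at initialisation, takes the first idle entry, and falls back to the most local CPU when nothing is idle. The struct and names are invented for the example.

  /* Illustrative sketch, not MuQSS source: walk a CPU list pre-sorted in
   * cache-locality order and take the first idle entry, else the most
   * local busy CPU. */
  #include <stdbool.h>
  #include <stdio.h>

  struct cpu_rank {
          int cpu;        /* logical CPU id */
          bool idle;      /* is this CPU currently idle? */
  };

  static int pick_cpu(const struct cpu_rank *order, int n)
  {
          for (int i = 0; i < n; i++)
                  if (order[i].idle)
                          return order[i].cpu;    /* best idle CPU by locality */
          return order[0].cpu;                    /* nothing idle: most local CPU */
  }

  int main(void)
  {
          /* Locality order for a waking task: SMT sibling, same-cache core,
           * same node, other node, as in the preference list above. */
          struct cpu_rank order[] = {
                  { .cpu = 1, .idle = false },    /* SMT sibling, busy */
                  { .cpu = 2, .idle = false },    /* same cache, busy  */
                  { .cpu = 4, .idle = true  },    /* same node, idle   */
                  { .cpu = 8, .idle = true  },    /* other node, idle  */
          };

          printf("selected CPU %d\n", pick_cpu(order, 4));
          return 0;
  }

Only the ordering idea matters here; everything else is scaffolding for the example.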
++
++Mux is therefore SMT, MC and NUMA aware without the need for extra
++intermittent balancing to maintain CPUs busy and make the most of cache
++coherency.
++
++
++Features
++
++As the initial prime target audience for MuQSS was the average desktop user, it
++was designed to not need tweaking, tuning or have features set to obtain benefit
++from it. Thus the number of knobs and features has been kept to an absolute
++minimum and should not require extra user input for the vast majority of cases.
++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
++does _not_ now feature is support for CGROUPS. The average user should neither
++need to know what these are, nor should they need to be using them to have good
++desktop behaviour. However since some applications refuse to work without
++cgroups, one can enable them with MuQSS as a stub and the filesystem will be
++created which will allow the applications to work.
++
++rr_interval:
++
++ /proc/sys/kernel/rr_interval
++
++The value is in milliseconds, and the default value is set to 6. Valid values
++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
++decreasing throughput, while increasing it will improve throughput, but at the
++cost of worsening latencies. It is based on the fact that humans can detect
++jitter at approximately 7ms, so aiming for much lower latencies is pointless
++under most circumstances. It is worth noting this fact when comparing the
++latency performance of MuQSS to other schedulers. Worst case latencies being
++higher than 7ms are far worse than average latencies not being in the
++microsecond range.
++
++interactive:
++
++ /proc/sys/kernel/interactive
++
++The value is a simple boolean of 1 for on and 0 for off and is set to on by
++default. Disabling this will disable the near-determinism of MuQSS when
++selecting the next task by not examining all CPUs for the earliest deadline
++task, or which CPU to wake to, instead prioritising CPU balancing for improved
++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
++instead of across the whole system.
++
++Runqueue sharing.
++
++By default MuQSS chooses to share runqueue resources (specifically the skip
++list and locking) between multicore siblings. It is configurable at build time
++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
++only between simultaneous multithreading siblings, multicore siblings, or
++symmetric multiprocessing physical packages. Additionally it can be set at
++boot time with the use of the rqshare parameter. The reason for configurability
++is that some architectures have CPUs with many multicore siblings (>= 16)
++where it may be detrimental to throughput to share runqueues and another
++sharing option may be desirable. Additionally, more sharing than usual can
++improve latency on a system-wide level at the expense of throughput if desired.
++
++The options are:
++none, smt, mc, smp
++
++eg:
++ rqshare=mc
++
++Isochronous scheduling:
++
++Isochronous scheduling is a unique scheduling policy designed to provide
++near-real-time performance to unprivileged (ie non-root) users without the
++ability to starve the machine indefinitely.
Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks. To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. 
When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus + PHONY := _all + _all: + ++CKVERSION = -ck1 ++CKNAME = MuQSS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. + # +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index ef179033a7c2..14b576a531ad 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -665,6 +665,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index 3a138f8c7299..65f44e309a08 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -30,7 +30,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 8a50efb559f3..d8507d20c258 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1238,6 +1238,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 519ff58e67b3..b2a05b6f7d80 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 0f7381ee0c37..3d747237bfed 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 1d923dbb9928..9c1931f1fafd 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 2773899c21b3..870866aaa39d 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT_VOLUNTARY=n + CONFIG_TASKSTATS=y + CONFIG_TASK_DELAY_ACCT=y + CONFIG_TASK_XACCT=y +@@ -27,6 +27,11 @@ CONFIG_MODVERSIONS=y + CONFIG_BLK_DEV_INTEGRITY=y + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set ++# CONFIG_ARCH_MULTI_V7 is not set ++CONFIG_ARCH_MXS=y ++# CONFIG_ARM_THUMB is not set ++CONFIG_PREEMPT=y ++CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y + CONFIG_UNIX=y +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 3f047afb982c..d35eae0a5c7d 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -864,6 +864,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig +new file mode 100644 +index 000000000000..39b91dfa55b5 +--- /dev/null ++++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig +@@ -0,0 +1,121 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y 
++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig +new file mode 100644 +index 000000000000..675cadb3a0c4 +--- /dev/null ++++ b/arch/blackfin/configs/BF526-EZBRD_defconfig +@@ -0,0 +1,158 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF526=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN526_EZBRD=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y 
++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +new file mode 100644 +index 000000000000..4c517c443af5 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +@@ -0,0 +1,188 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_EZKIT_V2=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# 
CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++CONFIG_KEYBOARD_ADP5520=y ++# CONFIG_KEYBOARD_ATKBD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_I2C=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_PMIC_ADP5520=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_LQ035Q1=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_NEW_LEDS=y ++CONFIG_LEDS_CLASS=y ++CONFIG_LEDS_ADP5520=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig +new file mode 100644 +index 000000000000..bf8df3e6cf02 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT_defconfig +@@ -0,0 +1,181 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set 
++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_T350MCQB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_LTV350QV=m ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_RTC_CLASS=y 
++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig +new file mode 100644 +index 000000000000..0220b3b15c53 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig +@@ -0,0 +1,178 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_LOCALVERSION="DEV_0-1_pre2010" ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_TLL6527M=y ++CONFIG_BF527_UART1_PORTG=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++CONFIG_BOOT_LOAD=0x400000 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0xFFC2 ++CONFIG_BANK_1=0xFFC2 ++CONFIG_BANK_2=0xFFC2 ++CONFIG_BANK_3=0xFFC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=m ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_AD714X=y ++CONFIG_INPUT_ADXL34X=y ++# 
CONFIG_SERIO is not set ++CONFIG_BFIN_PPI=m ++CONFIG_BFIN_SIMPLE_TIMER=m ++CONFIG_BFIN_SPORT=m ++# CONFIG_CONSOLE_TRANSLATIONS is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C_CHARDEV=y ++# CONFIG_I2C_HELPER_AUTO is not set ++CONFIG_I2C_SMBUS=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_MEDIA_SUPPORT=y ++CONFIG_VIDEO_DEV=y ++# CONFIG_MEDIA_TUNER_CUSTOMISE is not set ++CONFIG_VIDEO_HELPER_CHIPS_AUTO=y ++CONFIG_VIDEO_BLACKFIN_CAM=m ++CONFIG_OV9655=y ++CONFIG_FB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++# CONFIG_RPCSEC_GSS_KRB5 is not set ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC7=m +diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig +new file mode 100644 +index 000000000000..6023e3fd2c48 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BFIN533_EZKIT=y ++CONFIG_TIMER0=11 ++CONFIG_CLKIN_HZ=27000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m 
++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig +new file mode 100644 +index 000000000000..f5cd0f18b711 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-STAMP_defconfig +@@ -0,0 +1,124 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_TIMER0=11 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m 
++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig +new file mode 100644 +index 000000000000..48085fde7f9e +--- /dev/null ++++ b/arch/blackfin/configs/BF537-STAMP_defconfig +@@ -0,0 +1,136 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR1=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y 
++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig +new file mode 100644 +index 000000000000..12deeaaef3cb +--- /dev/null ++++ b/arch/blackfin/configs/BF538-EZKIT_defconfig +@@ -0,0 +1,133 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF538=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_IRQ_TIMER1=12 ++CONFIG_IRQ_TIMER2=12 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is 
not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_DEV=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_PHYLIB=y ++CONFIG_SMSC_PHY=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_SPI=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++CONFIG_SERIAL_BFIN_UART2=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FB_BFIN_LQ035Q1=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig +new file mode 100644 +index 000000000000..6a68ffc55b5a +--- /dev/null ++++ b/arch/blackfin/configs/BF548-EZKIT_defconfig +@@ -0,0 +1,207 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF548_std=y ++CONFIG_IRQ_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_CACHELINE_ALIGNED_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_EBIU_MBSCTLVAL=0x0 ++CONFIG_EBIU_MODEVAL=0x1 
++CONFIG_EBIU_FCTLVAL=0x6 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR3=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_BF5XX=y ++# CONFIG_MTD_NAND_BF5XX_HWECC is not set ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_ATA=y ++# CONFIG_SATA_PMP is not set ++CONFIG_PATA_BF54X=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMSC911X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++CONFIG_INPUT_EVBUG=m ++# CONFIG_KEYBOARD_ATKBD is not set ++CONFIG_KEYBOARD_BFIN=y ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=m ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_FB_BF54X_LQ043=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_AC97=y ++CONFIG_SND_BF5XX_SOC_AD1980=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_MMC=y ++CONFIG_MMC_BLOCK=m ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y 
++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_NTFS_FS=m ++CONFIG_NTFS_RW=y ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig +new file mode 100644 +index 000000000000..e9f3ba783a4e +--- /dev/null ++++ b/arch/blackfin/configs/BF561-ACVILON_defconfig +@@ -0,0 +1,149 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_BF_REV_0_5=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_BFIN561_ACVILON=y ++# CONFIG_BF561_COREB is not set ++CONFIG_CLKIN_HZ=12000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_4M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0x99b2 ++CONFIG_BANK_1=0x3350 ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_SYN_COOKIES=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_MTD_PHRAM=y ++CONFIG_MTD_BLOCK2MTD=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_PLATFORM=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=2 ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMSC911X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_PIO=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_PCA_PLATFORM=y ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_GPIO_PCF857X=y ++CONFIG_SENSORS_LM75=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++# CONFIG_SND_DRIVERS is not set ++# 
CONFIG_SND_USB is not set ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SPORT_NUM=1 ++CONFIG_USB=y ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_SERIAL=y ++CONFIG_USB_SERIAL_FTDI_SIO=y ++CONFIG_USB_SERIAL_PL2303=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_DS1307=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_FAT_DEFAULT_CODEPAGE=866 ++CONFIG_FAT_DEFAULT_IOCHARSET="cp1251" ++CONFIG_NTFS_FS=y ++CONFIG_CONFIGFS_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_JFFS2_COMPRESSION_OPTIONS=y ++# CONFIG_JFFS2_ZLIB is not set ++CONFIG_JFFS2_LZO=y ++# CONFIG_JFFS2_RTIME is not set ++CONFIG_JFFS2_CMODE_FAVOURLZO=y ++CONFIG_CRAMFS=y ++CONFIG_MINIX_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_DEFAULT="cp1251" ++CONFIG_NLS_CODEPAGE_866=y ++CONFIG_NLS_CODEPAGE_1251=y ++CONFIG_NLS_KOI8_R=y ++CONFIG_NLS_UTF8=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +new file mode 100644 +index 000000000000..89b75a6c3fab +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +@@ -0,0 +1,112 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_SMP=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set 
++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig +new file mode 100644 +index 000000000000..67b3d2f419ba +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BFIN_L2_DCACHEABLE=y ++CONFIG_BFIN_L2_WRITETHROUGH=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# 
CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig +new file mode 100644 +index 000000000000..8cc75d4218fb +--- /dev/null ++++ b/arch/blackfin/configs/BF609-EZKIT_defconfig +@@ -0,0 +1,154 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF609=y ++CONFIG_PINT1_ASSIGN=0x01010000 ++CONFIG_PINT2_ASSIGN=0x07000101 ++CONFIG_PINT3_ASSIGN=0x02020303 ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++# CONFIG_APP_STACK_L1 is not set ++# CONFIG_BFIN_INS_LOWOVERHEAD is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM_BFIN_WAKE_PE12=y ++CONFIG_PM_BFIN_WAKE_PE12_POL=1 ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=y ++CONFIG_CPU_FREQ_GOV_ONDEMAND=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_IP_PNP_RARP=y ++# CONFIG_IPV6 is not set ++CONFIG_NETFILTER=y ++CONFIG_CAN=y ++CONFIG_CAN_BFIN=y ++CONFIG_IRDA=y ++CONFIG_IRTTY_SIR=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_CFI_STAA=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_MTD_UBI=m ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++CONFIG_STMMAC_ETH=y ++CONFIG_STMMAC_IEEE1588=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y 
++CONFIG_INPUT_BFIN_ROTARY=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_SIMPLE_TIMER=m ++# CONFIG_BFIN_CRC is not set ++CONFIG_BFIN_LINKPORT=y ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_ADI_V3=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_PINCTRL_MCP23S08=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++# CONFIG_SND_DRIVERS is not set ++# CONFIG_SND_SPI is not set ++# CONFIG_SND_USB is not set ++CONFIG_SND_SOC=m ++CONFIG_USB=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=m ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_USB_ZERO=y ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++# CONFIG_IOMMU_SUPPORT is not set ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=m ++CONFIG_UBIFS_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++CONFIG_FRAME_POINTER=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO_HMAC=m ++CONFIG_CRYPTO_MD4=m ++CONFIG_CRYPTO_MD5=m ++CONFIG_CRYPTO_ARC4=m ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRYPTO_DEV_BFIN_CRC=m +diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig +new file mode 100644 +index 000000000000..9faf0ec7007f +--- /dev/null ++++ b/arch/blackfin/configs/BlackStamp_defconfig +@@ -0,0 +1,108 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF532=y ++CONFIG_BF_REV_0_5=y ++CONFIG_BLACKSTAMP=y ++CONFIG_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_ROMKERNEL=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_BINFMT_SHARED_FLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_NBD=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is 
not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=m ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V4=y ++CONFIG_SMB_FS=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_UTF8=y ++CONFIG_SYSCTL_SYSCALL_CHECK=y ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig +new file mode 100644 +index 000000000000..4a1ad4fd7bb2 +--- /dev/null ++++ b/arch/blackfin/configs/CM-BF527_defconfig +@@ -0,0 +1,129 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN527_BLUETECHNIX_CM=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xFFC0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y 
++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_USB=m ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=m ++CONFIG_USB_MUSB_HDRC=m ++CONFIG_USB_MUSB_PERIPHERAL=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=m ++CONFIG_USB_GADGET=m ++CONFIG_USB_ETH=m ++CONFIG_USB_MASS_STORAGE=m ++CONFIG_USB_G_SERIAL=m ++CONFIG_USB_G_PRINTER=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_ITU_T=y ++CONFIG_CRC7=y +diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig +new file mode 100644 +index 000000000000..9d787e28bbe8 +--- /dev/null ++++ b/arch/blackfin/configs/PNAV-10_defconfig +@@ -0,0 +1,111 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_PNAV10=y ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_1=0x33B0 ++CONFIG_BANK_2=0x33B0 ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_BFIN_MAC_USE_L1 is not set ++CONFIG_BFIN_TX_DESC_NUM=100 ++CONFIG_BFIN_RX_DESC_NUM=100 ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=y ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y 
++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_CLASS_DEVICE=y ++CONFIG_BACKLIGHT_CLASS_DEVICE=y ++CONFIG_SOUND=y ++CONFIG_SND=m ++# CONFIG_SND_SUPPORT_OLD_API is not set ++# CONFIG_SND_VERBOSE_PROCFS is not set ++CONFIG_SOUND_PRIME=y ++# CONFIG_HID is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_HUNT_FOR_ZERO is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++# CONFIG_ACCESS_CHECK is not set ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig +new file mode 100644 +index 000000000000..225df32dc9a8 +--- /dev/null ++++ b/arch/blackfin/configs/SRV1_defconfig +@@ -0,0 +1,88 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++CONFIG_KALLSYMS_ALL=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BOOT_LOAD=0x400000 ++CONFIG_CLKIN_HZ=22118400 ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=m ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_HWMON=m ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y +diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig +new file mode 100644 +index 000000000000..425c24e43c34 +--- /dev/null ++++ b/arch/blackfin/configs/TCM-BF518_defconfig +@@ -0,0 +1,131 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y 
++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_BF_REV_0_1=y ++CONFIG_BFIN518F_TCM=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_ADV_OPTIONS=y ++CONFIG_MTD_CFI_GEOMETRY=y ++# CONFIG_MTD_MAP_BANK_WIDTH_1 is not set ++# CONFIG_MTD_MAP_BANK_WIDTH_4 is not set ++# CONFIG_MTD_CFI_I2 is not set ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_DEBUG=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++# CONFIG_MISC_FILESYSTEMS is not set ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 
7a7af706e898..be19bf122fde 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 9085f4d6c698..fb23111d45f6 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 21a1168ae301..529a1b1007cf 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index 0921ef38e9fb..6da05cef46f8 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,5 +1,5 @@ + CONFIG_SYSVIPC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 328d4dfeb4cb..e17cb23173ea 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index 914af125a7fa..76a64290373f 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 4ecb157e56d4..ea7309283b01 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -1,10 +1,10 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASKSTATS=y +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 63fe2da1b37f..7f08ee237345 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index 24e07180c57d..38582e8f71c4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -1,9 +1,9 @@ 
++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=m + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=18 +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index 738ba3b1374b..6a3267e8aa0d 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_EXPERT=y + CONFIG_SLAB=y +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 2c7adea7638f..1c82d62bee72 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +index d3e3d94e90c3..578524f80cc4 100644 +--- a/arch/parisc/configs/712_defconfig ++++ b/arch/parisc/configs/712_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_GSC_LASI=y + # CONFIG_PDC_CHASSIS is not set + CONFIG_BINFMT_MISC=m +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +index 64d45a8b6ca0..d1bdfad94048 100644 +--- a/arch/parisc/configs/c3000_defconfig ++++ b/arch/parisc/configs/c3000_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA8X00=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_GSC is not set + CONFIG_PCI=y + CONFIG_PCI_LBA=y +diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig +index 5b877ca34ebf..0d976614934c 100644 +--- a/arch/parisc/configs/defconfig ++++ b/arch/parisc/configs/defconfig +@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IOMMU_CCIO=y + CONFIG_GSC_LASI=y + CONFIG_GSC_WAX=y +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 3e56c9c2f16e..ecee9c2a0062 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -853,6 +853,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig +new file mode 100644 +index 000000000000..04fee07ea6c5 +--- /dev/null ++++ b/arch/powerpc/configs/c2k_defconfig +@@ -0,0 +1,389 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODVERSIONS=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++# CONFIG_PPC_CHRP is not set ++# CONFIG_PPC_PMAC is not set ++CONFIG_EMBEDDED6xx=y ++CONFIG_PPC_C2K=y ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y ++CONFIG_CPU_FREQ_GOV_PERFORMANCE=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=m ++CONFIG_CPU_FREQ_GOV_ONDEMAND=m ++CONFIG_GEN_RTC=y ++CONFIG_HIGHMEM=y ++CONFIG_PREEMPT=y ++CONFIG_BINFMT_MISC=y ++CONFIG_PM=y ++CONFIG_PCI_MSI=y ++CONFIG_HOTPLUG_PCI=y ++CONFIG_HOTPLUG_PCI_SHPC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_NETFILTER=y ++# CONFIG_NETFILTER_XT_MATCH_SCTP is not set ++CONFIG_IP_NF_IPTABLES=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=m ++CONFIG_IP_NF_TARGET_REJECT=m ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_IP6_NF_IPTABLES=m ++CONFIG_IP6_NF_MATCH_EUI64=m ++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_IP_SCTP=m ++CONFIG_ATM=m ++CONFIG_ATM_CLIP=m ++CONFIG_ATM_LANE=m ++CONFIG_ATM_BR2684=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_ATM=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m 
++CONFIG_CLS_U32_PERF=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_IND=y ++CONFIG_BT=m ++CONFIG_BT_RFCOMM=m ++CONFIG_BT_RFCOMM_TTY=y ++CONFIG_BT_BNEP=m ++CONFIG_BT_BNEP_MC_FILTER=y ++CONFIG_BT_BNEP_PROTO_FILTER=y ++CONFIG_BT_HIDP=m ++CONFIG_BT_HCIUART=m ++CONFIG_BT_HCIUART_H4=y ++CONFIG_BT_HCIUART_BCSP=y ++CONFIG_BT_HCIBCM203X=m ++CONFIG_BT_HCIBFUSB=m ++CONFIG_BT_HCIVHCI=m ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP_OF=y ++CONFIG_BLK_DEV_LOOP=m ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_NBD=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=m ++CONFIG_BLK_DEV_SD=m ++CONFIG_CHR_DEV_ST=m ++CONFIG_CHR_DEV_OSST=m ++CONFIG_BLK_DEV_SR=m ++CONFIG_BLK_DEV_SR_VENDOR=y ++CONFIG_CHR_DEV_SG=m ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_BLK_DEV_3W_XXXX_RAID=m ++CONFIG_SCSI_3W_9XXX=m ++CONFIG_SCSI_ACARD=m ++CONFIG_SCSI_AACRAID=m ++CONFIG_SCSI_AIC7XXX=m ++CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 ++CONFIG_AIC7XXX_RESET_DELAY_MS=15000 ++# CONFIG_AIC7XXX_DEBUG_ENABLE is not set ++# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_AIC79XX=m ++CONFIG_AIC79XX_CMDS_PER_DEVICE=4 ++CONFIG_AIC79XX_RESET_DELAY_MS=15000 ++# CONFIG_AIC79XX_DEBUG_ENABLE is not set ++# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_ARCMSR=m ++CONFIG_MEGARAID_NEWGEN=y ++CONFIG_MEGARAID_MM=m ++CONFIG_MEGARAID_MAILBOX=m ++CONFIG_MEGARAID_SAS=m ++CONFIG_SCSI_GDTH=m ++CONFIG_SCSI_IPS=m ++CONFIG_SCSI_INITIO=m ++CONFIG_SCSI_SYM53C8XX_2=m ++CONFIG_SCSI_QLOGIC_1280=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_NETCONSOLE=m ++CONFIG_TUN=m ++# CONFIG_ATM_DRIVERS is not set ++CONFIG_MV643XX_ETH=y ++CONFIG_VITESSE_PHY=y ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=m ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_SERIAL_MPSC=y ++CONFIG_SERIAL_MPSC_CONSOLE=y ++CONFIG_NVRAM=m ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_MV64XXX=m ++CONFIG_HWMON=m ++CONFIG_SENSORS_ADM1021=m ++CONFIG_SENSORS_ADM1025=m ++CONFIG_SENSORS_ADM1026=m ++CONFIG_SENSORS_ADM1031=m ++CONFIG_SENSORS_DS1621=m ++CONFIG_SENSORS_GL518SM=m ++CONFIG_SENSORS_MAX1619=m ++CONFIG_SENSORS_LM75=m ++CONFIG_SENSORS_LM77=m ++CONFIG_SENSORS_LM78=m ++CONFIG_SENSORS_LM80=m ++CONFIG_SENSORS_LM83=m ++CONFIG_SENSORS_LM85=m ++CONFIG_SENSORS_LM87=m ++CONFIG_SENSORS_LM90=m ++CONFIG_SENSORS_PCF8591=m ++CONFIG_SENSORS_VIA686A=m ++CONFIG_SENSORS_W83781D=m ++CONFIG_SENSORS_W83L785TS=m ++CONFIG_WATCHDOG=y ++CONFIG_SOFT_WATCHDOG=m ++CONFIG_PCIPCWATCHDOG=m ++CONFIG_WDTPCI=m ++CONFIG_USBPCWATCHDOG=m ++# CONFIG_VGA_CONSOLE is not set ++CONFIG_USB=m ++CONFIG_USB_MON=m ++CONFIG_USB_EHCI_HCD=m ++CONFIG_USB_EHCI_ROOT_HUB_TT=y ++CONFIG_USB_OHCI_HCD=m ++CONFIG_USB_OHCI_HCD_PPC_OF_BE=y ++CONFIG_USB_UHCI_HCD=m ++CONFIG_USB_ACM=m ++CONFIG_USB_PRINTER=m ++CONFIG_USB_STORAGE=m ++CONFIG_USB_STORAGE_DATAFAB=m ++CONFIG_USB_STORAGE_FREECOM=m ++CONFIG_USB_STORAGE_ISD200=m ++CONFIG_USB_STORAGE_SDDR09=m ++CONFIG_USB_STORAGE_SDDR55=m ++CONFIG_USB_STORAGE_JUMPSHOT=m ++CONFIG_USB_MDC800=m ++CONFIG_USB_MICROTEK=m ++CONFIG_USB_SERIAL=m ++CONFIG_USB_SERIAL_GENERIC=y ++CONFIG_USB_SERIAL_BELKIN=m ++CONFIG_USB_SERIAL_WHITEHEAT=m ++CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m 
++CONFIG_USB_SERIAL_EMPEG=m ++CONFIG_USB_SERIAL_FTDI_SIO=m ++CONFIG_USB_SERIAL_VISOR=m ++CONFIG_USB_SERIAL_IPAQ=m ++CONFIG_USB_SERIAL_IR=m ++CONFIG_USB_SERIAL_EDGEPORT=m ++CONFIG_USB_SERIAL_EDGEPORT_TI=m ++CONFIG_USB_SERIAL_KEYSPAN_PDA=m ++CONFIG_USB_SERIAL_KEYSPAN=m ++CONFIG_USB_SERIAL_KLSI=m ++CONFIG_USB_SERIAL_KOBIL_SCT=m ++CONFIG_USB_SERIAL_MCT_U232=m ++CONFIG_USB_SERIAL_PL2303=m ++CONFIG_USB_SERIAL_SAFE=m ++CONFIG_USB_SERIAL_SAFE_PADDED=y ++CONFIG_USB_SERIAL_CYBERJACK=m ++CONFIG_USB_SERIAL_XIRCOM=m ++CONFIG_USB_SERIAL_OMNINET=m ++CONFIG_USB_EMI62=m ++CONFIG_USB_RIO500=m ++CONFIG_USB_LEGOTOWER=m ++CONFIG_USB_LCD=m ++CONFIG_USB_LED=m ++CONFIG_USB_TEST=m ++CONFIG_USB_ATM=m ++CONFIG_USB_SPEEDTOUCH=m ++CONFIG_INFINIBAND=m ++CONFIG_INFINIBAND_USER_MAD=m ++CONFIG_INFINIBAND_USER_ACCESS=m ++CONFIG_INFINIBAND_MTHCA=m ++CONFIG_INFINIBAND_IPOIB=m ++CONFIG_INFINIBAND_IPOIB_CM=y ++CONFIG_INFINIBAND_SRP=m ++CONFIG_DMADEVICES=y ++CONFIG_EXT4_FS=m ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_QUOTA=y ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_HFS_FS=m ++CONFIG_HFSPLUS_FS=m ++CONFIG_JFFS2_FS=y ++CONFIG_CRAMFS=m ++CONFIG_VXFS_FS=m ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_ROOT_NFS=y ++CONFIG_CIFS=m ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_T10DIF=m ++CONFIG_DEBUG_INFO=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_STACK_USAGE=y ++CONFIG_DEBUG_HIGHMEM=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_SPINLOCK=y ++CONFIG_BOOTX_TEXT=y ++CONFIG_PPC_EARLY_DEBUG=y ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 9dca4cffa623..09d38c3e59a5 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y 
++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig +new file mode 100644 +index 000000000000..46434ca1fa10 +--- /dev/null ++++ b/arch/score/configs/spct6600_defconfig +@@ -0,0 +1,84 @@ ++CONFIG_HZ_100=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_LOG_BUF_SHIFT=12 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_KALLSYMS is not set ++# CONFIG_HOTPLUG is not set ++CONFIG_SLAB=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_UNIX=y ++CONFIG_NET_KEY=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_ARPD=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=1 ++# CONFIG_MISC_DEVICES is not set ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_STALDRV=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++# CONFIG_HWMON is not set ++# CONFIG_VGA_CONSOLE is not set ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT3_FS=y ++# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_AUTOFS_FS=y ++CONFIG_AUTOFS4_FS=y ++CONFIG_PROC_KCORE=y ++# CONFIG_PROC_PAGE_MONITOR is not set ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_CRYPTO_NULL=y ++CONFIG_CRYPTO_CRYPTD=y ++CONFIG_CRYPTO_SEQIV=y ++CONFIG_CRYPTO_MD4=y ++CONFIG_CRYPTO_MICHAEL_MIC=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++# CONFIG_CRYPTO_HW is not set ++CONFIG_CRC_CCITT=y ++CONFIG_CRC16=y ++CONFIG_LIBCRC32C=y +diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 9a527f978106..5895f2cc726e 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + 
CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 3b0e1eb6e874..e296a2cd9903 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index 4ec961ace688..a03a1ad670a0 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 6c325d53a20a..98d4ef3d76cf 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig +new file mode 100644 +index 000000000000..939c63ba7e6e +--- /dev/null ++++ b/arch/tile/configs/tilegx_defconfig +@@ -0,0 +1,411 @@ ++CONFIG_TILEGX=y ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_FHANDLE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_NR_CPUS=100 ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_TILE_PCI_IO=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m 
++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_SAS_ATA=y ++CONFIG_ISCSI_TCP=m ++CONFIG_SCSI_MVSAS=y ++# CONFIG_SCSI_MVSAS_DEBUG is not set ++CONFIG_SCSI_MVSAS_TASKLET=y ++CONFIG_ATA=y ++CONFIG_SATA_AHCI=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_TARGET_CORE=m ++CONFIG_TCM_IBLOCK=m ++CONFIG_TCM_FILEIO=m ++CONFIG_TCM_PSCSI=m ++CONFIG_LOOPBACK_TARGET=m ++CONFIG_ISCSI_TARGET=m ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++CONFIG_SKY2=y ++CONFIG_PTP_1588_CLOCK_TILEGX=y ++# 
CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_TILEGX=y ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++CONFIG_DRM=m ++CONFIG_DRM_TDFX=m ++CONFIG_DRM_R128=m ++CONFIG_DRM_MGA=m ++CONFIG_DRM_VIA=m ++CONFIG_DRM_SAVAGE=m ++CONFIG_USB=y ++CONFIG_USB_EHCI_HCD=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_STORAGE=y ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y 
++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_KGDB=y ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m +diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig +new file mode 100644 +index 000000000000..e8c4003cbd81 +--- /dev/null ++++ b/arch/tile/configs/tilepro_defconfig +@@ -0,0 +1,524 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y 
++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_NETFILTER=y ++CONFIG_NF_CONNTRACK=m ++CONFIG_NF_CONNTRACK_SECMARK=y ++CONFIG_NF_CONNTRACK_ZONES=y ++CONFIG_NF_CONNTRACK_EVENTS=y ++CONFIG_NF_CT_PROTO_DCCP=m ++CONFIG_NF_CT_PROTO_UDPLITE=m ++CONFIG_NF_CONNTRACK_AMANDA=m ++CONFIG_NF_CONNTRACK_FTP=m ++CONFIG_NF_CONNTRACK_H323=m ++CONFIG_NF_CONNTRACK_IRC=m ++CONFIG_NF_CONNTRACK_NETBIOS_NS=m ++CONFIG_NF_CONNTRACK_PPTP=m ++CONFIG_NF_CONNTRACK_SANE=m ++CONFIG_NF_CONNTRACK_SIP=m ++CONFIG_NF_CONNTRACK_TFTP=m ++CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m ++CONFIG_NETFILTER_XT_TARGET_CONNMARK=m ++CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m ++CONFIG_NETFILTER_XT_TARGET_DSCP=m ++CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m ++CONFIG_NETFILTER_XT_TARGET_MARK=m ++CONFIG_NETFILTER_XT_TARGET_NFLOG=m ++CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m ++CONFIG_NETFILTER_XT_TARGET_NOTRACK=m ++CONFIG_NETFILTER_XT_TARGET_TEE=m ++CONFIG_NETFILTER_XT_TARGET_TPROXY=m ++CONFIG_NETFILTER_XT_TARGET_TRACE=m ++CONFIG_NETFILTER_XT_TARGET_SECMARK=m ++CONFIG_NETFILTER_XT_TARGET_TCPMSS=m ++CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m ++CONFIG_NETFILTER_XT_MATCH_CLUSTER=m ++CONFIG_NETFILTER_XT_MATCH_COMMENT=m ++CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m ++CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_CONNMARK=m ++CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m ++CONFIG_NETFILTER_XT_MATCH_DCCP=m ++CONFIG_NETFILTER_XT_MATCH_DSCP=m ++CONFIG_NETFILTER_XT_MATCH_ESP=m ++CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_HELPER=m ++CONFIG_NETFILTER_XT_MATCH_IPRANGE=m ++CONFIG_NETFILTER_XT_MATCH_IPVS=m ++CONFIG_NETFILTER_XT_MATCH_LENGTH=m ++CONFIG_NETFILTER_XT_MATCH_LIMIT=m ++CONFIG_NETFILTER_XT_MATCH_MAC=m ++CONFIG_NETFILTER_XT_MATCH_MARK=m ++CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m ++CONFIG_NETFILTER_XT_MATCH_OSF=m ++CONFIG_NETFILTER_XT_MATCH_OWNER=m ++CONFIG_NETFILTER_XT_MATCH_POLICY=m ++CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m ++CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m ++CONFIG_NETFILTER_XT_MATCH_QUOTA=m ++CONFIG_NETFILTER_XT_MATCH_RATEEST=m ++CONFIG_NETFILTER_XT_MATCH_REALM=m ++CONFIG_NETFILTER_XT_MATCH_RECENT=m ++CONFIG_NETFILTER_XT_MATCH_SOCKET=m ++CONFIG_NETFILTER_XT_MATCH_STATE=m ++CONFIG_NETFILTER_XT_MATCH_STATISTIC=m ++CONFIG_NETFILTER_XT_MATCH_STRING=m ++CONFIG_NETFILTER_XT_MATCH_TCPMSS=m ++CONFIG_NETFILTER_XT_MATCH_TIME=m ++CONFIG_NETFILTER_XT_MATCH_U32=m ++CONFIG_IP_VS=m ++CONFIG_IP_VS_IPV6=y ++CONFIG_IP_VS_PROTO_TCP=y ++CONFIG_IP_VS_PROTO_UDP=y ++CONFIG_IP_VS_PROTO_ESP=y ++CONFIG_IP_VS_PROTO_AH=y ++CONFIG_IP_VS_PROTO_SCTP=y ++CONFIG_IP_VS_RR=m ++CONFIG_IP_VS_WRR=m ++CONFIG_IP_VS_LC=m ++CONFIG_IP_VS_WLC=m ++CONFIG_IP_VS_LBLC=m ++CONFIG_IP_VS_LBLCR=m ++CONFIG_IP_VS_SED=m ++CONFIG_IP_VS_NQ=m ++CONFIG_NF_CONNTRACK_IPV4=m ++# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set ++CONFIG_IP_NF_IPTABLES=y ++CONFIG_IP_NF_MATCH_AH=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=y ++CONFIG_IP_NF_TARGET_REJECT=y ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_TARGET_TTL=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_SECURITY=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_NF_CONNTRACK_IPV6=m ++CONFIG_IP6_NF_MATCH_AH=m ++CONFIG_IP6_NF_MATCH_EUI64=m 
++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_MH=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_TARGET_HL=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_TARGET_REJECT=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_IP6_NF_SECURITY=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_IP6=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_BRIDGE_EBT_ULOG=m ++CONFIG_BRIDGE_EBT_NFLOG=m ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_IPT=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_ATA=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++# CONFIG_NET_VENDOR_3COM is not set ++CONFIG_E1000E=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not 
set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_CONFIGFS_FS=m ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_FRAME_WARN=2048 ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y ++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y 
++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC7=m +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..6f6ecda60d5b 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1034,6 +1034,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1064,6 +1080,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +@@ -1433,7 +1451,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1453,17 +1471,17 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 59ce9ed58430..f19741b0f43d 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -29,7 +29,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index d0a5ffeae8df..63f1fb92590c 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -28,7 +28,7 @@ CONFIG_SMP=y + CONFIG_CALGARY_IOMMU=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 4c297f69171d..5bc4f1be2617 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index 2aab80e19ae0..6200dbb3b5ef 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3544,7 +3544,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 22c6a2e61236..c4bccd444cbf 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1289,7 +1289,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. 
*/ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->watch_timer); +diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c +new file mode 100644 +index 000000000000..5228e78df804 +--- /dev/null ++++ b/drivers/char/snsc.c +@@ -0,0 +1,469 @@ ++/* ++ * SN Platform system controller communication support ++ * ++ * This file is subject to the terms and conditions of the GNU General Public ++ * License. See the file "COPYING" in the main directory of this archive ++ * for more details. ++ * ++ * Copyright (C) 2004, 2006 Silicon Graphics, Inc. All rights reserved. ++ */ ++ ++/* ++ * System controller communication driver ++ * ++ * This driver allows a user process to communicate with the system ++ * controller (a.k.a. "IRouter") network in an SGI SN system. ++ */ ++ ++#include <linux/interrupt.h> ++#include <linux/sched/signal.h> ++#include <linux/device.h> ++#include <linux/poll.h> ++#include <linux/init.h> ++#include <linux/slab.h> ++#include <linux/mutex.h> ++#include <asm/sn/io.h> ++#include <asm/sn/sn_sal.h> ++#include <asm/sn/module.h> ++#include <asm/sn/geo.h> ++#include <asm/sn/nodepda.h> ++#include "snsc.h" ++ ++#define SYSCTL_BASENAME "snsc" ++ ++#define SCDRV_BUFSZ 2048 ++#define SCDRV_TIMEOUT 1000 ++ ++static DEFINE_MUTEX(scdrv_mutex); ++static irqreturn_t ++scdrv_interrupt(int irq, void *subch_data) ++{ ++ struct subch_data_s *sd = subch_data; ++ unsigned long flags; ++ int status; ++ ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ spin_lock(&sd->sd_wlock); ++ status = ia64_sn_irtr_intr(sd->sd_nasid, sd->sd_subch); ++ ++ if (status > 0) { ++ if (status & SAL_IROUTER_INTR_RECV) { ++ wake_up(&sd->sd_rq); ++ } ++ if (status & SAL_IROUTER_INTR_XMIT) { ++ ia64_sn_irtr_intr_disable ++ (sd->sd_nasid, sd->sd_subch, ++ SAL_IROUTER_INTR_XMIT); ++ wake_up(&sd->sd_wq); ++ } ++ } ++ spin_unlock(&sd->sd_wlock); ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ return IRQ_HANDLED; ++} ++ ++/* ++ * scdrv_open ++ * ++ * Reserve a subchannel for system controller communication. ++ */ ++ ++static int ++scdrv_open(struct inode *inode, struct file *file) ++{ ++ struct sysctl_data_s *scd; ++ struct subch_data_s *sd; ++ int rv; ++ ++ /* look up device info for this device file */ ++ scd = container_of(inode->i_cdev, struct sysctl_data_s, scd_cdev); ++ ++ /* allocate memory for subchannel data */ ++ sd = kzalloc(sizeof (struct subch_data_s), GFP_KERNEL); ++ if (sd == NULL) { ++ printk("%s: couldn't allocate subchannel data\n", ++ __func__); ++ return -ENOMEM; ++ } ++ ++ /* initialize subch_data_s fields */ ++ sd->sd_nasid = scd->scd_nasid; ++ sd->sd_subch = ia64_sn_irtr_open(scd->scd_nasid); ++ ++ if (sd->sd_subch < 0) { ++ kfree(sd); ++ printk("%s: couldn't allocate subchannel\n", __func__); ++ return -EBUSY; ++ } ++ ++ spin_lock_init(&sd->sd_rlock); ++ spin_lock_init(&sd->sd_wlock); ++ init_waitqueue_head(&sd->sd_rq); ++ init_waitqueue_head(&sd->sd_wq); ++ sema_init(&sd->sd_rbs, 1); ++ sema_init(&sd->sd_wbs, 1); ++ ++ file->private_data = sd; ++ ++ /* hook this subchannel up to the system controller interrupt */ ++ mutex_lock(&scdrv_mutex); ++ rv = request_irq(SGI_UART_VECTOR, scdrv_interrupt, ++ IRQF_SHARED, SYSCTL_BASENAME, sd); ++ if (rv) { ++ ia64_sn_irtr_close(sd->sd_nasid, sd->sd_subch); ++ kfree(sd); ++ printk("%s: irq request failed (%d)\n", __func__, rv); ++ mutex_unlock(&scdrv_mutex); ++ return -EBUSY; ++ } ++ mutex_unlock(&scdrv_mutex); ++ return 0; ++} ++ ++/* ++ * scdrv_release ++ * ++ * Release a previously-reserved subchannel.
++ */ ++ ++static int ++scdrv_release(struct inode *inode, struct file *file) ++{ ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ int rv; ++ ++ /* free the interrupt */ ++ free_irq(SGI_UART_VECTOR, sd); ++ ++ /* ask SAL to close the subchannel */ ++ rv = ia64_sn_irtr_close(sd->sd_nasid, sd->sd_subch); ++ ++ kfree(sd); ++ return rv; ++} ++ ++/* ++ * scdrv_read ++ * ++ * Called to read bytes from the open IRouter pipe. ++ * ++ */ ++ ++static inline int ++read_status_check(struct subch_data_s *sd, int *len) ++{ ++ return ia64_sn_irtr_recv(sd->sd_nasid, sd->sd_subch, sd->sd_rb, len); ++} ++ ++static ssize_t ++scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) ++{ ++ int status; ++ int len; ++ unsigned long flags; ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ ++ /* try to get control of the read buffer */ ++ if (down_trylock(&sd->sd_rbs)) { ++ /* somebody else has it now; ++ * if we're non-blocking, then exit... ++ */ ++ if (file->f_flags & O_NONBLOCK) { ++ return -EAGAIN; ++ } ++ /* ...or if we want to block, then do so here */ ++ if (down_interruptible(&sd->sd_rbs)) { ++ /* something went wrong with wait */ ++ return -ERESTARTSYS; ++ } ++ } ++ ++ /* anything to read? */ ++ len = CHUNKSIZE; ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ status = read_status_check(sd, &len); ++ ++ /* if not, and we're blocking I/O, loop */ ++ while (status < 0) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ if (file->f_flags & O_NONBLOCK) { ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ up(&sd->sd_rbs); ++ return -EAGAIN; ++ } ++ ++ len = CHUNKSIZE; ++ set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(&sd->sd_rq, &wait); ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); ++ ++ remove_wait_queue(&sd->sd_rq, &wait); ++ if (signal_pending(current)) { ++ /* wait was interrupted */ ++ up(&sd->sd_rbs); ++ return -ERESTARTSYS; ++ } ++ ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ status = read_status_check(sd, &len); ++ } ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ ++ if (len > 0) { ++ /* we read something in the last read_status_check(); copy ++ * it out to user space ++ */ ++ if (count < len) { ++ pr_debug("%s: only accepting %d of %d bytes\n", ++ __func__, (int) count, len); ++ } ++ len = min((int) count, len); ++ if (copy_to_user(buf, sd->sd_rb, len)) ++ len = -EFAULT; ++ } ++ ++ /* release the read buffer and wake anyone who might be ++ * waiting for it ++ */ ++ up(&sd->sd_rbs); ++ ++ /* return the number of characters read in */ ++ return len; ++} ++ ++/* ++ * scdrv_write ++ * ++ * Writes a chunk of an IRouter packet (or other system controller data) ++ * to the system controller. ++ * ++ */ ++static inline int ++write_status_check(struct subch_data_s *sd, int count) ++{ ++ return ia64_sn_irtr_send(sd->sd_nasid, sd->sd_subch, sd->sd_wb, count); ++} ++ ++static ssize_t ++scdrv_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *f_pos) ++{ ++ unsigned long flags; ++ int status; ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ ++ /* try to get control of the write buffer */ ++ if (down_trylock(&sd->sd_wbs)) { ++ /* somebody else has it now; ++ * if we're non-blocking, then exit... 
++ */ ++ if (file->f_flags & O_NONBLOCK) { ++ return -EAGAIN; ++ } ++ /* ...or if we want to block, then do so here */ ++ if (down_interruptible(&sd->sd_wbs)) { ++ /* something went wrong with wait */ ++ return -ERESTARTSYS; ++ } ++ } ++ ++ count = min((int) count, CHUNKSIZE); ++ if (copy_from_user(sd->sd_wb, buf, count)) { ++ up(&sd->sd_wbs); ++ return -EFAULT; ++ } ++ ++ /* try to send the buffer */ ++ spin_lock_irqsave(&sd->sd_wlock, flags); ++ status = write_status_check(sd, count); ++ ++ /* if we failed, and we want to block, then loop */ ++ while (status <= 0) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ if (file->f_flags & O_NONBLOCK) { ++ spin_unlock_irqrestore(&sd->sd_wlock, flags); ++ up(&sd->sd_wbs); ++ return -EAGAIN; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(&sd->sd_wq, &wait); ++ spin_unlock_irqrestore(&sd->sd_wlock, flags); ++ ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); ++ ++ remove_wait_queue(&sd->sd_wq, &wait); ++ if (signal_pending(current)) { ++ /* wait was interrupted */ ++ up(&sd->sd_wbs); ++ return -ERESTARTSYS; ++ } ++ ++ spin_lock_irqsave(&sd->sd_wlock, flags); ++ status = write_status_check(sd, count); ++ } ++ spin_unlock_irqrestore(&sd->sd_wlock, flags); ++ ++ /* release the write buffer and wake anyone who's waiting for it */ ++ up(&sd->sd_wbs); ++ ++ /* return the number of characters accepted (should be the complete ++ * "chunk" as requested) ++ */ ++ if ((status >= 0) && (status < count)) { ++ pr_debug("Didn't accept the full chunk; %d of %d\n", ++ status, (int) count); ++ } ++ return status; ++} ++ ++static __poll_t ++scdrv_poll(struct file *file, struct poll_table_struct *wait) ++{ ++ __poll_t mask = 0; ++ int status = 0; ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ unsigned long flags; ++ ++ poll_wait(file, &sd->sd_rq, wait); ++ poll_wait(file, &sd->sd_wq, wait); ++ ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ spin_lock(&sd->sd_wlock); ++ status = ia64_sn_irtr_intr(sd->sd_nasid, sd->sd_subch); ++ spin_unlock(&sd->sd_wlock); ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ ++ if (status > 0) { ++ if (status & SAL_IROUTER_INTR_RECV) { ++ mask |= EPOLLIN | EPOLLRDNORM; ++ } ++ if (status & SAL_IROUTER_INTR_XMIT) { ++ mask |= EPOLLOUT | EPOLLWRNORM; ++ } ++ } ++ ++ return mask; ++} ++ ++static const struct file_operations scdrv_fops = { ++ .owner = THIS_MODULE, ++ .read = scdrv_read, ++ .write = scdrv_write, ++ .poll = scdrv_poll, ++ .open = scdrv_open, ++ .release = scdrv_release, ++ .llseek = noop_llseek, ++}; ++ ++static struct class *snsc_class; ++ ++/* ++ * scdrv_init ++ * ++ * Called at boot time to initialize the system controller communication ++ * facility. 
++ */ ++int __init ++scdrv_init(void) ++{ ++ geoid_t geoid; ++ cnodeid_t cnode; ++ char devname[32]; ++ char *devnamep; ++ struct sysctl_data_s *scd; ++ void *salbuf; ++ dev_t first_dev, dev; ++ nasid_t event_nasid; ++ ++ if (!ia64_platform_is("sn2")) ++ return -ENODEV; ++ ++ event_nasid = ia64_sn_get_console_nasid(); ++ ++ snsc_class = class_create(THIS_MODULE, SYSCTL_BASENAME); ++ if (IS_ERR(snsc_class)) { ++ printk("%s: failed to allocate class\n", __func__); ++ return PTR_ERR(snsc_class); ++ } ++ ++ if (alloc_chrdev_region(&first_dev, 0, num_cnodes, ++ SYSCTL_BASENAME) < 0) { ++ printk("%s: failed to register SN system controller device\n", ++ __func__); ++ return -ENODEV; ++ } ++ ++ for (cnode = 0; cnode < num_cnodes; cnode++) { ++ geoid = cnodeid_get_geoid(cnode); ++ devnamep = devname; ++ format_module_id(devnamep, geo_module(geoid), ++ MODULE_FORMAT_BRIEF); ++ devnamep = devname + strlen(devname); ++ sprintf(devnamep, "^%d#%d", geo_slot(geoid), ++ geo_slab(geoid)); ++ ++ /* allocate sysctl device data */ ++ scd = kzalloc(sizeof (struct sysctl_data_s), ++ GFP_KERNEL); ++ if (!scd) { ++ printk("%s: failed to allocate device info" ++ "for %s/%s\n", __func__, ++ SYSCTL_BASENAME, devname); ++ continue; ++ } ++ ++ /* initialize sysctl device data fields */ ++ scd->scd_nasid = cnodeid_to_nasid(cnode); ++ if (!(salbuf = kmalloc(SCDRV_BUFSZ, GFP_KERNEL))) { ++ printk("%s: failed to allocate driver buffer" ++ "(%s%s)\n", __func__, ++ SYSCTL_BASENAME, devname); ++ kfree(scd); ++ continue; ++ } ++ ++ if (ia64_sn_irtr_init(scd->scd_nasid, salbuf, ++ SCDRV_BUFSZ) < 0) { ++ printk ++ ("%s: failed to initialize SAL for" ++ " system controller communication" ++ " (%s/%s): outdated PROM?\n", ++ __func__, SYSCTL_BASENAME, devname); ++ kfree(scd); ++ kfree(salbuf); ++ continue; ++ } ++ ++ dev = first_dev + cnode; ++ cdev_init(&scd->scd_cdev, &scdrv_fops); ++ if (cdev_add(&scd->scd_cdev, dev, 1)) { ++ printk("%s: failed to register system" ++ " controller device (%s%s)\n", ++ __func__, SYSCTL_BASENAME, devname); ++ kfree(scd); ++ kfree(salbuf); ++ continue; ++ } ++ ++ device_create(snsc_class, NULL, dev, NULL, ++ "%s", devname); ++ ++ ia64_sn_irtr_intr_enable(scd->scd_nasid, ++ 0 /*ignored */ , ++ SAL_IROUTER_INTR_RECV); ++ ++ /* on the console nasid, prepare to receive ++ * system controller environmental events ++ */ ++ if(scd->scd_nasid == event_nasid) { ++ scdrv_event_init(scd); ++ } ++ } ++ return 0; ++} ++device_initcall(scdrv_init); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index e5252ef3812f..6ae6241185ea 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -237,7 +237,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index 75f3efee21a4..09b1932ce85b 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 267eac00a3fb..352af68c6cd7 100644 +--- 
a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index d8c40a83097d..8332baf4961c 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? +- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 39530d43590e..a7caf2eb5771 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index cf7cfda94107..f63e17489547 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index 856e7ab7f33e..766a26251337 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout 
|= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 137853944e46..76830892f373 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c +index f7de9118f609..f39ad2952c0f 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index cb0437b4c331..163fffc0e1d4 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index fb9de7bbcd19..e53cf45e7f3f 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index b0303cf00387..0925b5065147 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index b690796d24d4..448b13da62b4 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 8e6607fc8a67..b9ab770bbdb5 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + 
atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index bbb2575d4728..637757144221 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index d2539c95adb6..0c2f31a03ce9 100644 +--- a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index f24a1b0b801f..972313b92b0a 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2676,7 +2676,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index dde05e2fdc3e..fa6c1581136e 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 8dfbaff2d1fe..d1d6b9777f47 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... 
*/ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 90fb73575495..c94048b048a5 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. */ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index bffe548187ee..c2918ee3e100 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -798,7 +798,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -974,7 +974,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. 
+ */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 2018614f258f..fc19b312c345 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index 80608b53897b..84051b538fa8 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + +@@ -2273,7 +2273,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index 6822cd9ff8f1..ac3ad534be1a 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5176,7 +5176,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index b3650c989ed4..7ed1fb285754 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git 
a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index f98e3ae27bff..0741c8352a6d 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4742,7 +4742,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c +new file mode 100644 +index 000000000000..8cca151741b2 +--- /dev/null ++++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c +@@ -0,0 +1,426 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License version 2 for more details (a copy is included ++ * in the LICENSE file that accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License ++ * version 2 along with this program; If not, see ++ * http://www.gnu.org/licenses/gpl-2.0.html ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Use is subject to license terms. ++ * ++ * Copyright (c) 2012, Intel Corporation. ++ */ ++/* ++ * This file is part of Lustre, http://www.lustre.org/ ++ * Lustre is a trademark of Sun Microsystems, Inc. ++ * ++ * lnet/lnet/lib-eq.c ++ * ++ * Library level Event queue management routines ++ */ ++ ++#define DEBUG_SUBSYSTEM S_LNET ++ ++#include ++ ++/** ++ * Create an event queue that has room for \a count number of events. ++ * ++ * The event queue is circular and older events will be overwritten by new ++ * ones if they are not removed in time by the user using the functions ++ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to ++ * determine the appropriate size of the event queue to prevent this loss ++ * of events. Note that when EQ handler is specified in \a callback, no ++ * event loss can happen, since the handler is run for each event deposited ++ * into the EQ. ++ * ++ * \param count The number of events to be stored in the event queue. It ++ * will be rounded up to the next power of two. ++ * \param callback A handler function that runs when an event is deposited ++ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to ++ * indicate that no event handler is desired. ++ * \param handle On successful return, this location will hold a handle for ++ * the newly created EQ. ++ * ++ * \retval 0 On success. ++ * \retval -EINVAL If an parameter is not valid. ++ * \retval -ENOMEM If memory for the EQ can't be allocated. ++ * ++ * \see lnet_eq_handler_t for the discussion on EQ handler semantics. 
++ */ ++int ++LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, ++ struct lnet_handle_eq *handle) ++{ ++ struct lnet_eq *eq; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ /* ++ * We need count to be a power of 2 so that when eq_{enq,deq}_seq ++ * overflow, they don't skip entries, so the queue has the same ++ * apparent capacity at all times ++ */ ++ if (count) ++ count = roundup_pow_of_two(count); ++ ++ if (callback != LNET_EQ_HANDLER_NONE && count) ++ CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count); ++ ++ /* ++ * count can be 0 if only need callback, we can eliminate ++ * overhead of enqueue event ++ */ ++ if (!count && callback == LNET_EQ_HANDLER_NONE) ++ return -EINVAL; ++ ++ eq = kzalloc(sizeof(*eq), GFP_NOFS); ++ if (!eq) ++ return -ENOMEM; ++ ++ if (count) { ++ eq->eq_events = kvmalloc_array(count, sizeof(struct lnet_event), ++ GFP_KERNEL | __GFP_ZERO); ++ if (!eq->eq_events) ++ goto failed; ++ /* ++ * NB allocator has set all event sequence numbers to 0, ++ * so all them should be earlier than eq_deq_seq ++ */ ++ } ++ ++ eq->eq_deq_seq = 1; ++ eq->eq_enq_seq = 1; ++ eq->eq_size = count; ++ eq->eq_callback = callback; ++ ++ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), ++ sizeof(*eq->eq_refs[0])); ++ if (!eq->eq_refs) ++ goto failed; ++ ++ /* MUST hold both exclusive lnet_res_lock */ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); ++ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); ++ ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ lnet_eq2handle(handle, eq); ++ return 0; ++ ++failed: ++ kvfree(eq->eq_events); ++ ++ if (eq->eq_refs) ++ cfs_percpt_free(eq->eq_refs); ++ ++ kfree(eq); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(LNetEQAlloc); ++ ++/** ++ * Release the resources associated with an event queue if it's idle; ++ * otherwise do nothing and it's up to the user to try again. ++ * ++ * \param eqh A handle for the event queue to be released. ++ * ++ * \retval 0 If the EQ is not in use and freed. ++ * \retval -ENOENT If \a eqh does not point to a valid EQ. ++ * \retval -EBUSY If the EQ is still in use by some MDs. 
++ */ ++int ++LNetEQFree(struct lnet_handle_eq eqh) ++{ ++ struct lnet_eq *eq; ++ struct lnet_event *events = NULL; ++ int **refs = NULL; ++ int *ref; ++ int rc = 0; ++ int size = 0; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ eq = lnet_handle2eq(&eqh); ++ if (!eq) { ++ rc = -ENOENT; ++ goto out; ++ } ++ ++ cfs_percpt_for_each(ref, i, eq->eq_refs) { ++ LASSERT(*ref >= 0); ++ if (!*ref) ++ continue; ++ ++ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", ++ i, *ref); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ /* stash for free after lock dropped */ ++ events = eq->eq_events; ++ size = eq->eq_size; ++ refs = eq->eq_refs; ++ ++ lnet_res_lh_invalidate(&eq->eq_lh); ++ list_del(&eq->eq_list); ++ kfree(eq); ++ out: ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ kvfree(events); ++ if (refs) ++ cfs_percpt_free(refs); ++ ++ return rc; ++} ++EXPORT_SYMBOL(LNetEQFree); ++ ++void ++lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ ++ int index; ++ ++ if (!eq->eq_size) { ++ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); ++ eq->eq_callback(ev); ++ return; ++ } ++ ++ lnet_eq_wait_lock(); ++ ev->sequence = eq->eq_enq_seq++; ++ ++ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); ++ index = ev->sequence & (eq->eq_size - 1); ++ ++ eq->eq_events[index] = *ev; ++ ++ if (eq->eq_callback != LNET_EQ_HANDLER_NONE) ++ eq->eq_callback(ev); ++ ++ /* Wake anyone waiting in LNetEQPoll() */ ++ if (waitqueue_active(&the_lnet.ln_eq_waitq)) ++ wake_up_all(&the_lnet.ln_eq_waitq); ++ lnet_eq_wait_unlock(); ++} ++ ++static int ++lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ int new_index = eq->eq_deq_seq & (eq->eq_size - 1); ++ struct lnet_event *new_event = &eq->eq_events[new_index]; ++ int rc; ++ ++ /* must called with lnet_eq_wait_lock hold */ ++ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) ++ return 0; ++ ++ /* We've got a new event... */ ++ *ev = *new_event; ++ ++ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", ++ new_event, eq->eq_deq_seq, eq->eq_size); ++ ++ /* ...but did it overwrite an event we've not seen yet? */ ++ if (eq->eq_deq_seq == new_event->sequence) { ++ rc = 1; ++ } else { ++ /* ++ * don't complain with CERROR: some EQs are sized small ++ * anyway; if it's important, the caller should complain ++ */ ++ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", ++ eq->eq_deq_seq, new_event->sequence); ++ rc = -EOVERFLOW; ++ } ++ ++ eq->eq_deq_seq = new_event->sequence + 1; ++ return rc; ++} ++ ++/** ++ * A nonblocking function that can be used to get the next event in an EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. The event is removed from the queue. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 0 No pending event in the EQ. ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. 
++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++/** ++ * Block the calling process until there is an event in the EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. This function returns the next event ++ * in the EQ and removes it from the EQ. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++static int ++lnet_eq_wait_locked(int *timeout_ms, long state) ++__must_hold(&the_lnet.ln_eq_wait_lock) ++{ ++ int tms = *timeout_ms; ++ int wait; ++ wait_queue_entry_t wl; ++ unsigned long now; ++ ++ if (!tms) ++ return -ENXIO; /* don't want to wait and no new event */ ++ ++ init_waitqueue_entry(&wl, current); ++ set_current_state(state); ++ add_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ lnet_eq_wait_unlock(); ++ ++ if (tms < 0) { ++ schedule(); ++ } else { ++ now = jiffies; ++ schedule_msec_hrtimeout((tms)); ++ tms -= jiffies_to_msecs(jiffies - now); ++ if (tms < 0) /* no more wait but may have new event */ ++ tms = 0; ++ } ++ ++ wait = tms; /* might need to call here again */ ++ *timeout_ms = tms; ++ ++ lnet_eq_wait_lock(); ++ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ return wait; ++} ++ ++/** ++ * Block the calling process until there's an event from a set of EQs or ++ * timeout happens. ++ * ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully, in which case the corresponding event ++ * is consumed. ++ * ++ * LNetEQPoll() provides a timeout to allow applications to poll, block for a ++ * fixed period, or block indefinitely. ++ * ++ * \param eventqs,neq An array of EQ handles, and size of the array. ++ * \param timeout_ms Time in milliseconds to wait for an event to occur on ++ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an ++ * infinite timeout. ++ * \param interruptible, if true, use TASK_INTERRUPTIBLE, else TASK_NOLOAD ++ * \param event,which On successful return (1 or -EOVERFLOW), \a event will ++ * hold the next event in the EQs, and \a which will contain the index of the ++ * EQ from which the event was taken. ++ * ++ * \retval 0 No pending event in the EQs after timeout. ++ * \retval 1 Indicates success. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ indicated by \a which has been dropped due to limited space in the EQ. ++ * \retval -ENOENT If there's an invalid handle in \a eventqs. 
++ */ ++int ++LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, int timeout_ms, ++ int interruptible, ++ struct lnet_event *event, int *which) ++{ ++ int wait = 1; ++ int rc; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ if (neq < 1) ++ return -ENOENT; ++ ++ lnet_eq_wait_lock(); ++ ++ for (;;) { ++ for (i = 0; i < neq; i++) { ++ struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); ++ ++ if (!eq) { ++ lnet_eq_wait_unlock(); ++ return -ENOENT; ++ } ++ ++ rc = lnet_eq_dequeue_event(eq, event); ++ if (rc) { ++ lnet_eq_wait_unlock(); ++ *which = i; ++ return rc; ++ } ++ } ++ ++ if (!wait) ++ break; ++ ++ /* ++ * return value of lnet_eq_wait_locked: ++ * -1 : did nothing and it's sure no new event ++ * 1 : sleep inside and wait until new event ++ * 0 : don't want to wait anymore, but might have new event ++ * so need to call dequeue again ++ */ ++ wait = lnet_eq_wait_locked(&timeout_ms, ++ interruptible ? TASK_INTERRUPTIBLE ++ : TASK_NOLOAD); ++ if (wait < 0) /* no new event */ ++ break; ++ } ++ ++ lnet_eq_wait_unlock(); ++ return 0; ++} +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index fa597953e9a0..685cf842badc 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +index c94328a5bd4a..6e7d4671aa69 100644 +--- a/drivers/staging/speakup/speakup_acntpc.c ++++ b/drivers/staging/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- a/drivers/staging/speakup/speakup_apollo.c ++++ b/drivers/staging/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +index ddbb7e97d118..f9502addc765 100644 +--- a/drivers/staging/speakup/speakup_decext.c ++++ b/drivers/staging/speakup/speakup_decext.c +@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ 
schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 798c42dfa16c..d85b41db67a3 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index dccb4ea29d37..8ecead307d04 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 414827e888fc..cb31c9176daa 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 3568bfb89912..0a80b3b098b2 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- 
schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 1d1440d43002..52fe89ae1d9d 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index cfe63932f825..71c00ef772a3 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -913,7 +913,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c +index f70c9f79622e..0b363eaee24f 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 37345fb6191d..3874c17d1bc5 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -91,7 +91,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ 
-477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..73417df5daa2 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..95b427fdbb2e 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -214,13 +217,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + asmlinkage void preempt_schedule_irq(void); +@@ -644,9 +674,11 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int 
on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -671,10 +703,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -839,6 +886,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1283,6 +1334,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 1abe91ff6e4a..20ba383562b0 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO 
(MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..a9671b48799c 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 25b4fa00bad1..c2503cd28025 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -84,9 +84,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..da90d33ba4b3 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,18 @@ config THREAD_INFO_IN_TASK + + menu "General 
setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -802,6 +814,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -901,9 +914,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1032,6 +1049,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -1150,6 +1168,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..5c2bcbf25add 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +86,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +94,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index 91f6ebb30ef0..22792032de64 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1124,6 +1124,8 @@ static int __ref kernel_init(void *unused) + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" 
++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. 
++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..89ed751ac4e4 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -20,11 +21,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -32,7 +40,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -40,7 +51,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -51,9 +62,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index deff97217496..883998dd0437 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -2,7 +2,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -18,7 +18,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). 
+ + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +diff --git a/kernel/Makefile b/kernel/Makefile +index daad787fb795..9bb44fc4ef5b 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o ++ async.o range.o smpboot.o ucount.o skip_list.o + + obj-$(CONFIG_MODULES) += kmod.o + obj-$(CONFIG_MULTIUSER) += groups.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index f92d9a687372..d17db0ff775f 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -111,6 +111,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. 
++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 1753486b440c..f43423737493 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -24,9 +24,20 @@ + #include "internals.h" + + #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index b262f47046ca..9797ad652268 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -433,6 +433,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -454,7 +482,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..304c0c8c2bea 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. 
+ */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..a04ffebc6b7a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..fafb5a790cf1 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7606 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.196 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. 
++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. 
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. 
++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. 
++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = NS_TO_US(delta) / us_interval; ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. 
++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. 
*/ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. 
This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. 
++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. 
++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. 
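On the wait_task_inactive() return value above: OR-ing the voluntary context-switch count with LONG_MIN forces the sign bit on, so a valid sample can never collide with the 0 that means "state changed", and comparing two samples tells the caller whether the task ran in between. A small userspace illustration; task_nvcsw() below is a hypothetical stand-in for reading p->nvcsw, not a kernel API:

    #include <limits.h>
    #include <stdio.h>

    /* hypothetical stand-in for reading p->nvcsw under the rq lock */
    static unsigned long task_nvcsw(void)
    {
        static unsigned long nvcsw = 41;
        return nvcsw++;             /* pretend the count moves between calls */
    }

    static unsigned long sample_switch_count(void)
    {
        /* mirror of "ncsw = p->nvcsw | LONG_MIN": MSB set => never zero */
        return task_nvcsw() | LONG_MIN;
    }

    int main(void)
    {
        unsigned long first  = sample_switch_count();
        unsigned long second = sample_switch_count();

        printf("first=%#lx second=%#lx -> %s\n", first, second,
               first == second ? "task stayed off the CPU"
                               : "task was scheduled in between");
        return 0;
    }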
++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
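The preemption rule used by try_preempt()/can_preempt() above reduces to a three-step comparison: priority first, then policy (SCHED_BATCH never preempts a peer), then the virtual deadline. A compact userspace sketch of that ordering; the deadline compare is simplified and the rq fields are not reproduced:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum policy { POL_NORMAL, POL_BATCH };   /* simplified stand-ins */

    struct fake_task {
        int         prio;       /* lower number = higher priority  */
        enum policy policy;
        uint64_t    deadline;   /* earlier (smaller) deadline wins */
    };

    /* same decision order as can_preempt(): prio, then policy, then deadline */
    static bool can_preempt(const struct fake_task *p, int rq_prio, uint64_t rq_deadline)
    {
        if (p->prio < rq_prio)
            return true;
        if (p->prio > rq_prio)
            return false;
        if (p->policy == POL_BATCH)
            return false;
        return p->deadline < rq_deadline;
    }

    int main(void)
    {
        struct fake_task waker = { .prio = 120, .policy = POL_NORMAL, .deadline = 1000 };

        printf("earlier deadline preempts: %d\n", can_preempt(&waker, 120, 2000)); /* 1 */
        printf("later deadline preempts:   %d\n", can_preempt(&waker, 120,  500)); /* 0 */
        printf("better prio preempts:      %d\n", can_preempt(&waker, 130,  500)); /* 1 */
        return 0;
    }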
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++#endif ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. 
++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. 
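The remote-wakeup path above (ttwu_queue_remote() feeding sched_ttwu_pending()) relies on a lock-free singly linked list: producers push from any CPU, only the push that finds the list empty sends the resched IPI, and the target CPU detaches the whole list in one exchange. A self-contained userspace sketch of the same pattern using C11 atomics; this is not the kernel llist API, just the idea:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct wake_node {
        struct wake_node *next;
        int tid;                    /* stand-in for the task being woken */
    };

    /* head of the per-CPU wake list; NULL means empty */
    static _Atomic(struct wake_node *) wake_list = NULL;

    /* producer side: lock-free push, may run from any CPU */
    static int wake_list_add(struct wake_node *n)
    {
        struct wake_node *old = atomic_load_explicit(&wake_list, memory_order_relaxed);
        do {
            n->next = old;
        } while (!atomic_compare_exchange_weak_explicit(&wake_list, &old, n,
                                                        memory_order_release,
                                                        memory_order_relaxed));
        return old == NULL;         /* true if we were first: caller sends the IPI */
    }

    /* consumer side: the "IPI handler" detaches the whole list at once */
    static struct wake_node *wake_list_del_all(void)
    {
        return atomic_exchange_explicit(&wake_list, NULL, memory_order_acquire);
    }

    int main(void)
    {
        struct wake_node a = { .tid = 1 }, b = { .tid = 2 };

        if (wake_list_add(&a))
            printf("first add: would send resched IPI\n");
        wake_list_add(&b);

        for (struct wake_node *n = wake_list_del_all(); n; n = n->next)
            printf("activating tid %d\n", n->tid);
        return 0;
    }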
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
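For the sched_reset_on_fork handling in sched_fork() above: a negative nice value is reset to nice 0 via the PRIO_TO_NICE/NICE_TO_PRIO mapping. A small sketch of that reset, assuming the usual kernel mapping where static_prio 120 corresponds to nice 0 (include/linux/sched/prio.h); values here are illustrative:

    #include <stdio.h>

    /* usual nice<->priority mapping: static_prio 100..139 covers nice -20..+19 */
    #define MAX_RT_PRIO   100
    #define DEFAULT_PRIO  (MAX_RT_PRIO + 20)
    #define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
    #define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)

    int main(void)
    {
        int static_prio = NICE_TO_PRIO(-5);          /* a boosted, nice -5 parent */

        /* sched_reset_on_fork: a negative nice is reset to nice 0 for the child */
        if (PRIO_TO_NICE(static_prio) < 0)
            static_prio = NICE_TO_PRIO(0);

        printf("child static_prio after reset: %d (nice %d)\n",
               static_prio, PRIO_TO_NICE(static_prio));   /* 120 (nice 0) */
        return 0;
    }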
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. 
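The hrexpiry machinery above arms a one-shot high-resolution timer for the remaining time slice so preemption lands at the right nanosecond instead of the next jiffy. As a rough userspace analogue only (timerfd instead of hrtimers; the 6 ms slice is an example value, not taken from this patch's defaults):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/timerfd.h>
    #include <unistd.h>

    int main(void)
    {
        /* pretend the current task has 6 ms of time slice left */
        uint64_t slice_ns = 6ULL * 1000 * 1000;

        int tfd = timerfd_create(CLOCK_MONOTONIC, 0);
        if (tfd < 0) {
            perror("timerfd_create");
            return 1;
        }

        /* one-shot expiry, the rough equivalent of hrexpiry_start(rq, delay) */
        struct itimerspec its = {
            .it_value.tv_sec  = slice_ns / 1000000000ULL,
            .it_value.tv_nsec = slice_ns % 1000000000ULL,
        };
        if (timerfd_settime(tfd, 0, &its, NULL) < 0) {
            perror("timerfd_settime");
            return 1;
        }

        uint64_t expirations;
        if (read(tfd, &expirations, sizeof(expirations)) != sizeof(expirations))
            perror("read");     /* blocks until the slice "expires" */
        else
            printf("slice expired -> would set need_resched on the current task\n");

        close(tfd);
        return 0;
    }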
Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. 
++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
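The fork-time bookkeeping in wake_up_new_task() above keeps the total outstanding time slice constant: the parent's remaining slice is halved and shared with the child, unless the remainder drops under RESCHED_US, in which case the parent is marked for reschedule and the child starts with a fresh slice and deadline. A small arithmetic sketch; the RESCHED_US threshold and the 6 ms fresh slice below are illustrative values, not taken from this patch:

    #include <stdio.h>

    #define RESCHED_US 10          /* illustrative threshold only */

    struct slice { int parent_us, child_us, parent_resched; };

    /* mirror of the split done in wake_up_new_task() for non-FIFO parents */
    static struct slice split_timeslice(int parent_us, int fresh_slice_us)
    {
        struct slice s = { .parent_us = parent_us / 2 };

        if (s.parent_us < RESCHED_US) {
            /* parent has effectively run out: reschedule it, child gets a new slice */
            s.parent_resched = 1;
            s.child_us = fresh_slice_us;
        } else {
            s.child_us = s.parent_us;        /* both run on half the old slice */
        }
        return s;
    }

    int main(void)
    {
        struct slice a = split_timeslice(4000, 6000);  /* plenty left: split in half  */
        struct slice b = split_timeslice(12, 6000);    /* nearly empty: expire parent */

        printf("a: parent=%dus child=%dus resched=%d\n", a.parent_us, a.child_us, a.parent_resched);
        printf("b: parent=%dus child=%dus resched=%d\n", b.parent_us, b.child_us, b.parent_resched);
        return 0;
    }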
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ ++ do_pending_softirq(rq, current); ++ ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). 
If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). 
++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. 
++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. 
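The avenrun/calc_global_load() code above is the classic fixed-point exponential moving average: every LOAD_FREQ interval, load = load*e + active*(1-e) in 11-bit fixed point. A simplified, self-contained sketch; the decay constants are the stock values from the loadavg header, and the kernel's calc_load() additionally rounds up while load is rising:

    #include <stdio.h>

    /* 11-bit fixed point, as in include/linux/sched/loadavg.h */
    #define FSHIFT  11
    #define FIXED_1 (1 << FSHIFT)
    #define EXP_1   1884    /* 1/exp(5sec/1min)  in fixed point */
    #define EXP_5   2014    /* 1/exp(5sec/5min)                 */
    #define EXP_15  2037    /* 1/exp(5sec/15min)                */

    /* simplified calc_load() */
    static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active)
    {
        return (load * exp + active * (FIXED_1 - exp)) / FIXED_1;
    }

    int main(void)
    {
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 2 * FIXED_1;   /* pretend 2 runnable tasks, every sample */

        for (int tick = 0; tick < 60; tick++) {      /* 60 samples ~= 5 minutes */
            avenrun[0] = calc_load(avenrun[0], EXP_1,  active);
            avenrun[1] = calc_load(avenrun[1], EXP_5,  active);
            avenrun[2] = calc_load(avenrun[2], EXP_15, active);
        }

        printf("load averages: %.2f %.2f %.2f\n",
               avenrun[0] / (double)FIXED_1,
               avenrun[1] / (double)FIXED_1,
               avenrun[2] / (double)FIXED_1);
        return 0;
    }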
++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
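fixed_power_int() above is exponentiation by squaring in fixed point, which calc_load_n() uses to fast-forward the load average over n missed samples in O(log n) multiplies. A standalone sketch that mirrors the same loop and checks it against libm (compile with -lm; the EXP_1 decay value is assumed from the stock loadavg header):

    #include <math.h>
    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)

    /* exponentiation by squaring in frac_bits fixed point */
    static unsigned long fixed_pow(unsigned long x, unsigned int frac_bits, unsigned int n)
    {
        unsigned long result = 1UL << frac_bits;

        while (n) {
            if (n & 1) {
                result *= x;
                result += 1UL << (frac_bits - 1);   /* round to nearest */
                result >>= frac_bits;
            }
            n >>= 1;
            if (!n)
                break;
            x *= x;
            x += 1UL << (frac_bits - 1);
            x >>= frac_bits;
        }
        return result;
    }

    int main(void)
    {
        unsigned long e = 1884;     /* EXP_1: decay factor per 5 s sample, fixed point */
        unsigned int  n = 12;       /* pretend 12 samples (one minute) were missed     */

        double exact = pow(e / (double)FIXED_1, n);
        printf("fixed point e^%u = %lu (~%.4f), libm says %.4f\n",
               n, fixed_pow(e, FSHIFT, n),
               fixed_pow(e, FSHIFT, n) / (double)FIXED_1, exact);
        return 0;
    }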
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
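The pc_idle_time()/pc_system_time()/pc_user_time() functions above all follow the same banking pattern described in the comment: accumulate raw nanoseconds in an unbanked counter, and once at least one tick's worth has built up, flush whole jiffies into cpustat and keep the remainder. A minimal sketch of that pattern, assuming HZ=100 and accounting exact jiffies rather than TICK_APPROX_NS:

    #include <stdint.h>
    #include <stdio.h>

    #define JIFFY_NS (1000000000ULL / 100)   /* assume HZ=100 -> 10 ms per jiffy */

    struct bank {
        uint64_t unbanked_ns;   /* sub-jiffy remainder, like rq->user_ns etc. */
        uint64_t accounted_ns;  /* what has been pushed into cpustat          */
    };

    /* bank ns, flush whole jiffies, keep the remainder */
    static void account_ns(struct bank *b, uint64_t ns)
    {
        b->unbanked_ns += ns;
        if (b->unbanked_ns >= JIFFY_NS) {
            uint64_t ticks = b->unbanked_ns / JIFFY_NS;

            b->accounted_ns += ticks * JIFFY_NS;
            b->unbanked_ns  %= JIFFY_NS;
        }
    }

    int main(void)
    {
        struct bank b = { 0, 0 };

        account_ns(&b, 7 * 1000 * 1000);    /*  7 ms: nothing flushed yet       */
        account_ns(&b, 6 * 1000 * 1000);    /* 13 ms total: one jiffy accounted */

        printf("accounted %llu ns, %llu ns still banked\n",
               (unsigned long long)b.accounted_ns,
               (unsigned long long)b.unbanked_ns);   /* 10000000 and 3000000 */
        return 0;
    }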
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ * ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. 
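Why update_cpu_clock_tick()/update_cpu_clock_switch() above convert the slice to microseconds before subtracting: a signed 32-bit counter holding nanoseconds wraps after roughly two seconds, while the same counter holding microseconds lasts over half an hour, comfortably longer than any time slice. A one-liner check:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
        /* how long a signed 32-bit counter lasts at each resolution */
        double ns_horizon_s = INT_MAX / 1e9;   /* ~2.1 seconds  */
        double us_horizon_s = INT_MAX / 1e6;   /* ~35.8 minutes */

        printf("32-bit signed counter overflows after %.1f s of ns, %.1f min of us\n",
               ns_horizon_s, us_horizon_s / 60.0);
        return 0;
    }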
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
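On the "115/128 is ~90/100" hysteresis used in no_iso_tick() above: 115/128 = 0.8984, so multiplying by 115 and shifting right by 7 lands within a percent of the intended 90% threshold while dividing by a power of two. A quick check, assuming the common default of sched_iso_cpu = 70 (an assumption, not read from this patch):

    #include <stdio.h>

    int main(void)
    {
        int sched_iso_cpu = 70;     /* assumed default ISO CPU limit in percent */

        int shift_approx = sched_iso_cpu * 115 / 128;   /* what the code computes */
        int exact_90pct  = sched_iso_cpu * 90  / 100;   /* the intended ~90%      */

        printf("115/128 of %d%% = %d, 90%% of %d%% = %d\n",
               sched_iso_cpu, shift_approx, sched_iso_cpu, exact_90pct);
        return 0;
    }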
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ curr = rq->curr; ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. 
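The remote-tick state machine above moves between OFFLINE, OFFLINING and RUNNING using atomic_xchg() in sched_tick_start()/sched_tick_stop() and atomic_fetch_add_unless() in sched_tick_remote(), so the worker can step OFFLINING down to OFFLINE without ever decrementing a RUNNING state. A userspace sketch of the same transitions with C11 atomics; fetch_add_unless() below is a stand-in for the kernel primitive, not the kernel API itself:

    #include <stdatomic.h>
    #include <stdio.h>

    #define REMOTE_OFFLINE   0
    #define REMOTE_OFFLINING 1
    #define REMOTE_RUNNING   2

    /* add 'delta' unless the current value equals 'unless'; return the old value */
    static int fetch_add_unless(atomic_int *v, int delta, int unless)
    {
        int old = atomic_load(v);

        while (old != unless &&
               !atomic_compare_exchange_weak(v, &old, old + delta))
            ;
        return old;
    }

    int main(void)
    {
        atomic_int state = REMOTE_RUNNING;

        /* sched_tick_stop(): unconditional move to OFFLINING */
        int os = atomic_exchange(&state, REMOTE_OFFLINING);
        printf("stop: was %d, now OFFLINING\n", os);

        /* sched_tick_remote(): step OFFLINING down to OFFLINE, but never
         * decrement a RUNNING state (that would corrupt the state machine) */
        os = fetch_add_unless(&state, -1, REMOTE_RUNNING);
        printf("remote tick: was %d, now %d -> %s\n", os, atomic_load(&state),
               os == REMOTE_RUNNING ? "requeue the work" : "let the work die");
        return 0;
    }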
*/ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. 
++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. ++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. 
++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. 
++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ do_pending_softirq(rq, next); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & PF_WQ_WORKER) { ++ preempt_disable(); ++ wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. 
++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. 
++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq, NULL); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. 
Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. 
++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. 
Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. 
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct sched_domain *sd; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { ++ cpu = i; ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. 
*/ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ struct cpumask old_mask; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, p->cpus_ptr); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. 
++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header 
*sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++ return per_cpu(cpu_llc_shared_map, cpu); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct rq *rq, *other_rq, *leader = cpu_rq(0); ++ struct sched_domain *sd; ++ int cpu, other_cpu, i; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for (cpu = num_online_cpus() - 1; cpu >= 0; cpu--) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ 
local_irq_enable(); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (rq->mc_leader == rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (rq->smt_leader == rq) && ++#endif ++ (rq->smp_leader == rq)) { ++ total_runqueues++; ++ } ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int selected_cpus[NR_CPUS], selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus[NR_CPUS], ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (other_rq->mc_leader == other_rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (other_rq->smt_leader == other_rq) && ++#endif ++ (other_rq->smp_leader == other_rq)) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ 
total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->smp_leader = rq; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = rq; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = rq; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. 
++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR 
++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..5214b158d82f +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1005 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). 
++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq 
*smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, 
struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. 
++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. 
++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_util_with(struct rq __maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) ++{ ++ return util; ++} ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. ++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..f3d8dca0538a 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -213,7 +219,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -658,7 +664,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index 7dc20a3232e7..e733a0a53b0a 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,9 +17,11 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); + void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..f077fcd22d2b 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. 
+@@ -663,7 +643,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..e0aa6c73a5fa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -225,6 +225,8 @@ static void cpuidle_idle_call(void) + static void do_idle(void) + { + int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -235,7 +237,10 @@ static void do_idle(void) + */ + + __current_set_polling(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); +@@ -273,7 +278,8 @@ static void do_idle(void) + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +@@ -355,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. + */ +@@ -479,3 +486,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..add1d74c2e91 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2496,3 +2509,30 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++/* MuQSS compatibility functions */ ++static inline bool softirq_pending(int cpu) ++{ ++ return false; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 49b835f1305f..0253ea846c0d 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -3,6 +3,7 @@ + * Scheduler topology setup/handling methods + */ + #include "sched.h" ++#include "linux/sched/deadline.h" + + DEFINE_MUTEX(sched_domains_mutex); + +@@ -442,7 +443,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -468,7 +473,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + 
set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
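As an illustration of the API laid out above (a sketch added for clarity, not
part of the original skip list code), a minimal caller could look like the
following, assuming the keyType/valueType typedefs and struct layouts from the
skip_list header added elsewhere in this patchset (a u64 key and a void *
value); the example_* names are hypothetical:

    struct example_item {
        skiplist_node node;                 /* embedded: no allocation on insert */
        int payload;
    };

    static skiplist_node example_header;    /* sentinel node */
    static skiplist *example_list;

    static void example_setup(void)
    {
        skiplist_init(&example_header);
        example_list = new_skiplist(&example_header);
    }

    static void example_add(struct example_item *item, u64 key, u64 niffies)
    {
        skiplist_node_init(&item->node);
        /* O(log n) insert; the caller's clock (niffies in MuQSS) seeds randomLevel() */
        skiplist_insert(example_list, &item->node, key, item, (unsigned int)niffies);
    }

    static struct example_item *example_pop_first(void)
    {
        /* O(1) lookup: the lowest key is always first in the level-0 list */
        skiplist_node *node = example_list->header->next[0];
        struct example_item *item;

        if (node == example_list->header)   /* empty: the header points to itself */
            return NULL;
        item = node->value;                 /* read before delete re-initialises the node */
        skiplist_delete(example_list, node);    /* O(k) unlink, no search */
        return item;
    }

This mirrors how the scheduler drives the structure: tasks are inserted with a
priority/deadline derived key, and the next task to run is simply the first
level-0 entry.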
++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..349f5a249593 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; +-#ifdef CONFIG_PRINTK ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_MUQSS ++static int zero = 0; ++static int one = 1; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -300,7 +310,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +327,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = 
&sysctl_sched_child_runs_first, +@@ -498,6 +509,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1082,62 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index fcc42353f125..46bb16d3c159 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -66,6 +66,9 @@ config NO_HZ_COMMON + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + ++config NO_HZ_FULL ++ bool ++ + choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ +@@ -87,8 +90,9 @@ config NO_HZ_IDLE + + Most of the time you want to say Y here. + +-config NO_HZ_FULL ++config NO_HZ_FULL_NODEF + bool "Full dynticks system (tickless)" ++ select NO_HZ_FULL + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping +@@ -114,6 +118,8 @@ config NO_HZ_FULL + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + ++ Not recommended for desktops,laptops, or mobile devices. ++ + Say N. 
+ + endchoice +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..544c58c29267 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++int __read_mostly hrtimer_granularity_us = 100; ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 65605530ee34..75e67a12a97b 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2206,3 +2206,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) ++ return schedule_timeout(jiffs); ++ ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 
0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 500; ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..0db83bdf7f39 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -845,7 +845,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 4820823515e9..13034cc7c9a4 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1567,7 +1568,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1585,6 +1586,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1654,7 +1658,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +@@ -1889,6 +1893,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. 
++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); +@@ -1897,10 +1913,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +@@ -2042,7 +2058,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -2056,7 +2084,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..6edb01f2fd81 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index ee4eecc7e1c2..22c1b0469468 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -164,7 +164,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
+diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 48b1e429857c..908c866bc9fc 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 19fa73df0846..46caed9b924d 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2001,7 +2001,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2009,7 +2009,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index f70b9f7e68bb..77b65398ca07 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index fe99584c917f..f1344d532a13 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1136,7 +1136,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 271235a69c01..3ec90e1b1eb4 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 6497c1ea6228..08fefeca9d82 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index b6378f025836..5f5e58655d32 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + __printf(3, 4) +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index f70211e6b174..5ae4421225e6 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, diff --git a/linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch b/linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch new file mode 100644 index 0000000..2c4837e --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch @@ -0,0 +1,78 @@ +From 
f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - MuQSS + +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index 84a1d08d68551..57c3036a68952 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ ++#ifdef CONFIG_ZENIFY ++int sched_iso_cpu __read_mostly = 25; ++#else + int sched_iso_cpu __read_mostly = 70; ++#endif + + /* + * sched_yield_type - Choose what sort of yield sched_yield will perform. + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + choice + prompt "Timer frequency" + default HZ_100 if SCHED_MUQSS +- default HZ_250_NODEF if !SCHED_MUQSS ++ default HZ_500_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -50,6 +50,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500_NODEF ++ bool "500 HZ" ++ help ++ 500 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ ++ config HZ_750_NODEF ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000_NODEF + bool "1000 HZ" + help +@@ -63,6 +70,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250_NODEF + default 300 if HZ_300_NODEF ++ default 500 if HZ_500_NODEF ++ default 750 if HZ_750_NODEF + default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus + + CKVERSION = -ck1 + CKNAME = MuQSS Powered +-EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. 
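As a brief illustration of how the MuQSS knobs wired up in the sysctl table
earlier in this patchset surface at runtime (a sketch added for clarity, not
part of the patches themselves): the iso_cpu value retuned in the hunk above is
exposed as /proc/sys/kernel/iso_cpu, backed by proc_dointvec_minmax with 0..100
bounds, so a privileged userspace program can read or adjust it directly:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/kernel/iso_cpu", "r+");
        int pct;

        if (!f) {
            perror("iso_cpu");  /* not a MuQSS kernel, or insufficient permissions */
            return 1;
        }
        if (fscanf(f, "%d", &pct) == 1)
            printf("SCHED_ISO tasks may use up to %d%% CPU\n", pct);

        /* proc_dointvec_minmax rejects values outside the 0..100 range */
        rewind(f);
        fprintf(f, "%d\n", 70);     /* e.g. restore the stock MuQSS default shown above */
        fclose(f);
        return 0;
    }

The same pattern applies to rr_interval, interactive and yield_type, within the
bounds declared for them in kern_table.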
diff --git a/linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch new file mode 100644 index 0000000..02933e4 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (45) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (45) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch b/linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch new file mode 100644 index 0000000..c1929e8 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch b/linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch new file mode 100644 index 0000000..21f2d69 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch @@ -0,0 +1,213 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index c2d831b242b6d18a47e0d87a9f5433a7748b52ff..5bc8d7a8f920c21feab69b2706a3328dc8d39f9a 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -409,12 +409,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + * [L] ->on_rq + * RELEASE (rq->lock) + * +- * If we observe the old CPU in task_rq_lock(), the acquire of ++ * If we observe the old CPU in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * +- * If we observe the new CPU in task_rq_lock(), the address +- * dependency headed by '[L] rq = task_rq()' and the acquire +- * will pair with the WMB to ensure we then also see migrating. ++ * If we observe the new CPU in task_rq_lock, the acquire will ++ * pair with the WMB to ensure we must then also see migrating. 
+ */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + return rq; +@@ -952,9 +953,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) + smp_wmb(); + + #ifdef CONFIG_THREAD_INFO_IN_TASK +- WRITE_ONCE(p->cpu, cpu); ++ p->cpu = cpu; + #else +- WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++ task_thread_info(p)->cpu = cpu; + #endif + #endif + } +@@ -1035,7 +1036,7 @@ static void detach_task(struct rq *rq, struct task_struct *p, int target_cpu) + { + lockdep_assert_held(&rq->lock); + +- WRITE_ONCE(p->on_rq ,TASK_ON_RQ_MIGRATING); ++ p->on_rq = TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + dequeue_task(p, rq, 0); +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +index 20dcf19ea057627d91be07b4ec20f0827c30084c..24fa90ca63d144cc4f45d82d88407ea70d2d2edf 100644 +--- a/kernel/sched/pds_sched.h ++++ b/kernel/sched/pds_sched.h +@@ -56,7 +56,7 @@ static inline int task_on_rq_queued(struct task_struct *p) + + static inline int task_on_rq_migrating(struct task_struct *p) + { +- return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++ return p->on_rq == TASK_ON_RQ_MIGRATING; + } + + enum { + +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) 
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch b/linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch new file mode 100644 index 0000000..e6db1ad --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch @@ -0,0 +1,8387 @@ +From 89067d28ca90681fc6cf108de79b9aedb93dfa9d Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 25 Nov 2019 21:46:23 +0100 +Subject: PDS 099o, 5.4 rebase + + +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..360a229b0abe 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -82,6 +82,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +106,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt +new file mode 100644 +index 000000000000..709e86f6487e +--- /dev/null ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- ++ ++CONTENT ++======== ++ ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. Each cpu has its own ++run queue. 
A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..9d44d8d78259 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1034,6 +1034,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index b66e81c06a57..a294f8f5fd75 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index dced033875bf..d2cd03766b09 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
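The ondemand hunk above lowers DEF_FREQUENCY_UP_THRESHOLD from 80 to 63, which is what the reworded comment refers to: the governor now ramps up as soon as measured load exceeds 63% of the sampling window, i.e. as soon as idle time drops below 37%. A minimal standalone sketch of that decision follows; the load figure and the simplified variable names are illustrative only, not the governor's actual dbs bookkeeping.

#include <stdio.h>

#define DEF_FREQUENCY_UP_THRESHOLD	(63)	/* patched value, stock is 80 */

int main(void)
{
	unsigned int idle_pct = 30;		/* example sample: 30% idle */
	unsigned int load = 100 - idle_pct;	/* 70% busy */

	if (load > DEF_FREQUENCY_UP_THRESHOLD)
		printf("load %u%% > %u%%: request maximum frequency\n",
		       load, DEF_FREQUENCY_UP_THRESHOLD);
	else
		printf("load %u%%: scale frequency proportionally to load\n",
		       load);

	return 0;
}
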
+ */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..8268cad4b0a2 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -644,9 +645,13 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -655,6 +660,7 @@ struct task_struct { + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -663,6 +669,7 @@ struct task_struct { + * used CPU that may be idle. 
+ */ + int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ + int wake_cpu; + #endif + int on_rq; +@@ -672,13 +679,27 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_PDS ++ int time_slice; ++ u64 deadline; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* CONFIG_SCHED_PDS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1283,6 +1304,29 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..a5e5fc2c9170 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_PDS ++ ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) ++{ ++ return 1; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_PDS */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..fba04bb91492 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,7 +20,18 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..a96012e6f15e 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_PDS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return 
false;
+ }
+
+diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
+index 4b1c3b664f51..f186b8119ad6 100644
+--- a/include/linux/sched/task.h
++++ b/include/linux/sched/task.h
+@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
+ extern void free_task(struct task_struct *tsk);
+
+ /* sched_exec is called by processes performing an exec */
+-#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS)
+ extern void sched_exec(void);
+ #else
+ #define sched_exec() {}
+diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h
+new file mode 100644
+index 000000000000..713fedd8034f
+--- /dev/null
++++ b/include/linux/skip_list.h
+@@ -0,0 +1,177 @@
++/*
++ Copyright (C) 2016 Alfred Chen.
++
++ Code based on Con Kolivas's skip list implementation for BFS,
++ which is in turn based on the example originally by William Pugh.
++
++Skip Lists are a probabilistic alternative to balanced trees, as
++described in the June 1990 issue of CACM and were invented by
++William Pugh in 1987.
++
++A couple of comments about this implementation:
++
++This file only provides the infrastructure of a skip list.
++
++skiplist_node is embedded into the container data structure, to get rid
++of the dependency on kmalloc/kfree operations in scheduler code.
++
++A customized search function should be defined using the DEFINE_SKIPLIST_INSERT
++macro and used for the skip list insert operation.
++
++The random level is also not defined in this file; instead, it should be custom
++implemented, set to node->level and then passed to the customized skiplist_insert
++function.
++
++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL - 1).
++
++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of the original 16,
++considering that roughly 256 entries are needed to enable the top level when
++using random level p=0.5, and that number is more than enough for run queue
++usage in a scheduler. It also helps to reduce the memory usage of the
++embedded skip list node in task_struct by about 50%.
++
++The insertion routine has been implemented so as to use the
++dirty hack described in the CACM paper: if a random level is
++generated that is more than the current maximum level, the
++current maximum level plus one is used instead.
++
++BFS Notes: In this implementation of skiplists, there are bidirectional
++next/prev pointers and the insert function returns a pointer to the actual
++node the value is stored in. The key here is chosen by the scheduler so as to
++sort tasks according to the priority list requirements and is no longer used
++by the scheduler after insertion. The scheduler lookup, however, occurs in
++O(1) time because it is always the first item in the level 0 linked list.
++Since the task struct stores a copy of the node pointer upon skiplist_insert,
++it can also remove it much faster than the original implementation with the
++aid of prev<->next pointer manipulation and no searching.
++*/ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty()*/ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. ++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. 
++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 25b4fa00bad1..fc0aabdce15f 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -84,7 +84,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++#ifdef CONFIG_SCHED_PDS ++#define SCHED_ISO 4 ++#endif + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..ee3b9957cf3b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,21 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_PDS ++ bool "PDS-mq cpu scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -802,6 +817,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_PDS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -903,7 +919,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_PDS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1032,6 +1048,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
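The include/linux/skip_list.h header added above is self-contained, so a short usage sketch may help. The container type and the demo_* names below are invented for illustration; skiplist_node, skiplist_entry(), SKIPLIST_NODE_INIT(), DEFINE_SKIPLIST_INSERT_FUNC() and skiplist_del_init() are exactly the helpers introduced by this patch, which PDS uses with task_struct and the priodl key.

#include <linux/types.h>
#include <linux/skip_list.h>

struct demo_item {
	u64 key;			/* sort key, analogous to p->priodl */
	struct skiplist_node sl_node;	/* embedded node, no kmalloc needed */
};

/* Keep walking while the iterator's key is <= the new node's key, the
 * same shape as pds_skiplist_task_search() later in this patch. */
static inline bool demo_search(struct skiplist_node *it,
			       struct skiplist_node *node)
{
	return skiplist_entry(it, struct demo_item, sl_node)->key <=
		skiplist_entry(node, struct demo_item, sl_node)->key;
}

DEFINE_SKIPLIST_INSERT_FUNC(demo_insert, demo_search);

static struct skiplist_node demo_head = SKIPLIST_NODE_INIT(demo_head);

static void demo_add(struct demo_item *item, int random_level)
{
	/* The caller supplies the level, e.g. from a p=0.5 random source. */
	item->sl_node.level = random_level;
	demo_insert(&demo_head, &item->sl_node);
}

static void demo_del(struct demo_item *item)
{
	skiplist_del_init(&demo_head, &item->sl_node);
}
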
+@@ -1150,6 +1167,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_PDS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..89787e2feb60 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -58,6 +58,126 @@ struct task_struct init_task + __init_task_data + #endif + = { ++#ifdef CONFIG_SCHED_PDS ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), ++#endif ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ 
.numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -181,6 +301,7 @@ struct task_struct init_task + #ifdef CONFIG_SECURITY + .security = NULL, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index c87ee6412b36..4045c8532027 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..baa525865d5c 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_PDS ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2874bf556162..fad8a279fdfa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c 
+@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..8ebe4e33fb5f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..07f278dc3137 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
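Under CONFIG_SCHED_PDS the sugov_get_util() stub in the hunk above simply reports the CPU's full capacity, so the linear relation f = u * f_max quoted earlier in this file always resolves to the top of the policy range; the deadline and iowait refinements are sidestepped because PDS has no per-class utilization signal. A standalone illustration of that degenerate case; the 1024 capacity value is the nominal arch_scale_cpu_capacity() scale and is assumed here only for the example.

/* With util pinned to max, any f = u * f_max mapping returns f_max. */
static unsigned int pds_schedutil_request(unsigned int max_freq)
{
	unsigned long max  = 1024;	/* nominal CPU capacity (assumed) */
	unsigned long util = max;	/* what the PDS stub above reports */

	return (unsigned int)(max_freq * util / max);	/* == max_freq */
}
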
+@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -918,6 +929,7 @@ static int __init sugov_register(void) + fs_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..0a9548ee995c 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -663,7 +672,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..454fa7e460e3 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +new file mode 100644 +index 000000000000..aefbd9cebcfb +--- /dev/null ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6566 @@ ++/* ++ * kernel/sched/pds.c, was kernel/sched.c ++ * ++ * PDS-mq Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ */ ++#include "pds_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. 
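For reference, rr_interval (note that the comment above still speaks of a 6 ms minimum while SCHED_DEFAULT_RR below is 4) is turned into the per-task quota by timeslice() a few lines further down using MS_TO_US(), which is a shift by 10 rather than a true multiply by 1000. A small worked sketch of that conversion, with names suffixed _example to keep them apart from the real definitions:

#define MS_TO_US_EXAMPLE(TIME)	((TIME) << 10)	/* "ms" are really 1024 us units */

static int rr_interval_example = 4;		/* SCHED_DEFAULT_RR */

static inline int timeslice_example(void)
{
	/* 4 << 10 = 4096 us, so each refill is roughly 4.1 ms. */
	return MS_TO_US_EXAMPLE(rr_interval_example);
}
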
++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; ++ ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; ++} ++__setup("rr_interval=", rr_interval_set); ++ ++ ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; ++ ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ ++#endif ++ ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. 
++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes CPU fairly amongst tasks of the ++ * same nice value, it proportions CPU according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 task_deadline_diff(const struct task_struct *p) ++{ ++ return sched_prio2deadline[TASK_USER_PRIO(p)]; ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return sched_prio2deadline[USER_PRIO(static_prio)]; ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline for non-rt tasks. 
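The packed priodl field maintained by update_task_priodl() above is what makes that ordering cheap: the priority sits in the top 8 bits and the (right-shifted) deadline in the lower 56, so a single u64 comparison sorts tasks by priority first and deadline second, and both the skip-list search and check_preempt_curr() later in this patch rely on it. The nice-19 versus nice-0 figure also falls out of the sched_prio2deadline[] table earlier in the file: 258860401 / 42325748 is roughly 6.1, and 1 / 6.1^2 is about 3%. A minimal sketch of the packing and comparison; the helper names are invented for illustration.

#include <linux/types.h>

static inline u64 pack_priodl(int prio, u64 deadline)
{
	/* Mirrors update_task_priodl(): 8 bits of prio, 56 bits of deadline. */
	return (((u64)prio) << 56) | (deadline >> 8);
}

static inline bool runs_before(u64 priodl_a, u64 priodl_b)
{
	/* Lower priority value, or same priority and earlier deadline, wins. */
	return priodl_a < priodl_b;
}
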
++ */ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); ++ ++ update_task_priodl(p); ++} ++ ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); ++} ++ ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) ++{ ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; ++ ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
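The value of the fetch_or() construction above is that setting TIF_NEED_RESCHED and observing the previous flag word happen in one atomic step, so the caller learns reliably whether the target CPU was polling on need_resched (no IPI needed) or not (send one). A userspace analogue with C11 atomics, purely for illustration; the bit positions are examples, not the kernel's definitions.

#include <stdatomic.h>
#include <stdbool.h>

#define EX_TIF_NEED_RESCHED	(1u << 3)	/* example bit */
#define EX_TIF_POLLING_NRFLAG	(1u << 21)	/* example bit */

static bool set_need_resched_and_was_not_polling(_Atomic unsigned int *flags)
{
	unsigned int old = atomic_fetch_or(flags, EX_TIF_NEED_RESCHED);

	/* Only worth an IPI if the target was not polling on need_resched. */
	return !(old & EX_TIF_POLLING_NRFLAG);
}
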
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rcu_read_lock(); ++ ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) ++{ ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; ++} ++ ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) ++{ ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; ++} ++ ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) ++{ ++ struct task_struct *p = rq_first_queued_task(rq); ++ ++ if (p->prio != NORMAL_PRIO) ++ return; ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} ++ ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++} ++#endif ++ ++static inline void update_sched_rq_queued_masks(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif ++ ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; ++ ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) ++ return; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == 
per_cpu(sched_sibling_cpu, cpu)) ++ return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif ++} ++ ++static inline void update_sched_rq_pending_masks(struct rq *rq) ++{ ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); ++ ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); ++} ++ ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. ++ * ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
++ */ ++static inline bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
++ */ ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * PDS doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. 
++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ return policy_to_prio[p->policy]; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. 
We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * detach_task() -- detach the task for the migration specified in @target_cpu ++ */ ++static void detach_task(struct rq *rq, struct task_struct *p, int target_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq ,TASK_ON_RQ_MIGRATING); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, 0); ++ ++ set_task_cpu(p, target_cpu); ++} ++ ++/* ++ * attach_task() -- attach the task detached by detach_task() to its new rq. ++ */ ++static void attach_task(struct rq *rq, struct task_struct *p) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ BUG_ON(task_rq(p) != rq); ++ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. 
++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ detach_task(rq, p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ attach_task(rq, p); ++ ++ check_preempt_curr(rq, p); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. 
This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). 
At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, &p->cpus_mask) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. 
++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask; ++ ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; ++ ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++/* ++ * task_preemptible_rq - return the rq which the given task can preempt on ++ * @p: task wants to preempt CPU ++ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) ++{ ++ cpumask_t tmp; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++ ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on cpu which is not smt supressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); ++ } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) ++{ ++ cpumask_t tmp; ++ int level; ++ ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif ++ ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); ++ ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); ++ } ++ ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; ++ ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; ++ } ++ ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; ++ ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } ++ ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, 
int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** PDS ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. 
++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif ++ ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. 
++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); ++#endif ++ ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); ++ resched_curr(rq); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. 
++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_mask can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. 
++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. 
++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. 
++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. 
Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ pds_update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void pds_scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++ ++ /** ++ * p->time_slice < RESCHED_US. 
We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_SCHED_SMT ++static int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ int cpu; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* ++ * _something_ may have changed the task, double check again ++ */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return; ++ curr = rq->curr; ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; ++ ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++/* ++ * pds_sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void pds_sg_balance_check(const struct rq *rq) ++{ ++ cpumask_t chk; ++ int i; ++ ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* Only cpu in slibing idle group will do the checking */ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ detach_task(rq, p, dest_cpu); ++ attach_task(dest_rq, p); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio); ++ ++ spin_release(&src_rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ 
struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); ++#endif ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. 
++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_deadline(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ pds_sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & PF_WQ_WORKER) { ++ preempt_disable(); ++ wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. 
++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ ++ requeue_task(p, rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. 
There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++ ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ /* rq lock may not held!! */ ++ update_rq_clock(rq); ++ ++ p->static_prio = new_static; ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->deadline -= task_deadline_diff(p); ++ p->deadline += static_deadline_diff(new_static); ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. 
Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int level, prio = p->prio - MAX_RT_PRIO; ++ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; ++ ++ /* rt tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ preempt_disable(); ++ level = task_deadline_level(p, this_rq()); ++ preempt_enable(); ++ prio += level_to_nice_prio[level]; ++ if (idleprio_task(p)) ++ prio += NICE_WIDTH; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. 
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int old_policy = p->policy; ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; ++} ++ ++/* Actually do priority change: must hold rq lock. 
*/ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int ++__sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. 
++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. ++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. 
++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. 
Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_mask, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask ++ */ ++ cpumask_copy(new_mask, cpus_mask); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_mask); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); ++ } ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. 
++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In PDS, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* PDS TODO: should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. 
++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { ++ int dest_cpu; ++ ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ ++ /* skip the running task */ ++ if (task_running(p)) ++ continue; ++ ++ /* ++ * Rules for changing task_struct::cpus_mask are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. 
++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ node = &rq->sl_header; ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. 
This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ 
cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; ++#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. 
Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef 
CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +new file mode 100644 +index 000000000000..b3926a8425b2 +--- /dev/null ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,474 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. 
++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ struct skiplist_node sl_header; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 
rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : PDS need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. 
++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* PDS_SCHED_H */ +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..d3d12baa9036 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..26d6b47fc156 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_PDS + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_PDS + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_PDS */ + + #else + ++#ifndef CONFIG_SCHED_PDS + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..4fc9f2ead4d2 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" ++#else ++ + #include + + #include +@@ -2496,3 +2500,4 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++#endif /* !CONFIG_SCHED_PDS */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..45bd43942575 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..204933ebc95a 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_PDS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_PDS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof 
(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..71af3cd30ccc 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_PDS + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..3eaa2a21caa4 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..7425de1 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,192 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. 
Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case 
PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, + /* Amazon Annapurna Labs */ + { PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + diff --git a/linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch b/linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch new file mode 100644 index 0000000..027116f --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch @@ -0,0 +1,419 @@ +split the futex key setup from the queue locking and key reading. This +is useful to support the setup of multiple keys at the same time, like +what is done in futex_requeue() and what will be done for the +FUTEX_WAIT_MULTIPLE command. + +Signed-off-by: Gabriel Krisman Bertazi +--- + kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 42 insertions(+), 29 deletions(-) + +diff --git a/kernel/futex.c b/kernel/futex.c +index 6d50728ef2e7..91f3db335c57 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2631,6 +2631,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + __set_current_state(TASK_RUNNING); + } + ++static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ ++ u32 uval; ++ int ret; ++ ++retry_private: ++ *hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (ret) { ++ queue_unlock(*hb); ++ ++ ret = get_user(uval, uaddr); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ return 1; ++ } ++ ++ if (uval != val) { ++ queue_unlock(*hb); ++ ret = -EWOULDBLOCK; ++ } ++ ++ return ret; ++} ++ + /** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address +@@ -2651,7 +2684,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) + { +- u32 uval; + int ret; + + /* +@@ -2672,38 +2704,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. + */ +-retry: +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +-retry_private: +- *hb = queue_lock(q); ++ do { ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, ++ &q->key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; + +- ret = get_futex_value_locked(&uval, uaddr); ++ ret = __futex_wait_setup(uaddr, val, flags, q, hb); + +- if (ret) { +- queue_unlock(*hb); +- +- ret = get_user(uval, uaddr); ++ /* Drop key reference if retry or error. 
*/ + if (ret) +- goto out; ++ put_futex_key(&q->key); ++ } while (ret > 0); + +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- put_futex_key(&q->key); +- goto retry; +- } +- +- if (uval != val) { +- queue_unlock(*hb); +- ret = -EWOULDBLOCK; +- } +- +-out: +- if (ret) +- put_futex_key(&q->key); + return ret; + } + +-- +2.20.1 + +This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows +a thread to wait on several futexes at the same time, and be awoken by +any of them. In a sense, it implements one of the features that was +supported by pooling on the old FUTEX_FD interface. + +My use case for this operation lies in Wine, where we want to implement +a similar interface available in Windows, used mainly for event +handling. The wine folks have an implementation that uses eventfd, but +it suffers from FD exhaustion (I was told they have application that go +to the order of multi-milion FDs), and higher CPU utilization. + +In time, we are also proposing modifications to glibc and libpthread to +make this feature available for Linux native multithreaded applications +using libpthread, which can benefit from the behavior of waiting on any +of a group of futexes. + +In particular, using futexes in our Wine use case reduced the CPU +utilization by 4% for the game Beat Saber and by 1.5% for the game +Shadow of Tomb Raider, both running over Proton (a wine based solution +for Windows emulation), when compared to the eventfd interface. This +implementation also doesn't rely of file descriptors, so it doesn't risk +overflowing the resource. + +Technically, the existing FUTEX_WAIT implementation can be easily +reworked by using do_futex_wait_multiple with a count of one, and I +have a patch showing how it works. I'm not proposing it, since +futex is such a tricky code, that I'd be more confortable to have +FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, +before considering modifying FUTEX_WAIT. + +From an implementation perspective, the futex list is passed as an array +of (pointer,value,bitset) to the kernel, which will enqueue all of them +and sleep if none was already triggered. It returns a hint of which +futex caused the wake up event to userspace, but the hint doesn't +guarantee that is the only futex triggered. Before calling the syscall +again, userspace should traverse the list, trying to re-acquire any of +the other futexes, to prevent an immediate -EWOULDBLOCK return code from +the kernel. + +This was tested using three mechanisms: + +1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and +running the unmodified tools/testing/selftests/futex and a full linux +distro on top of this kernel. + +2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a +multi-threaded, event-handling setup. + +3) By running the Wine fsync implementation and executing multi-threaded +applications, in particular the modern games mentioned above, on top of +this implementation. + +Signed-off-by: Zebediah Figura +Signed-off-by: Steven Noonan +Signed-off-by: Pierre-Loup A. 
Griffais +Signed-off-by: Gabriel Krisman Bertazi +--- + include/uapi/linux/futex.h | 7 ++ + kernel/futex.c | 161 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 164 insertions(+), 4 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..2401c4cf5095 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -150,4 +151,10 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 91f3db335c57..2623e8f152cd 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -183,6 +183,7 @@ static int __read_mostly futex_cmpxchg_enabled; + #endif + #define FLAGS_CLOCKRT 0x02 + #define FLAGS_HAS_TIMEOUT 0x04 ++#define FLAGS_WAKE_MULTIPLE 0x08 + + /* + * Priority Inheritance state: +@@ -2720,6 +2721,150 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++static int do_futex_wait_multiple(struct futex_wait_block *wb, ++ u32 count, unsigned int flags, ++ ktime_t *abs_time) ++{ ++ ++ struct hrtimer_sleeper timeout, *to; ++ struct futex_hash_bucket *hb; ++ struct futex_q *qs = NULL; ++ int ret; ++ int i; ++ ++ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); ++ if (!qs) ++ return -ENOMEM; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++ retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ qs[i].bitset = wb[i].bitset; ++ ++ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret != 0)) { ++ for (--i; i >= 0; i--) ++ put_futex_key(&qs[i].key); ++ goto out; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, ++ flags, &qs[i], &hb); ++ if (ret) { ++ /* Drop the failed key directly. keys 0..(i-1) ++ * will be put by unqueue_me. ++ */ ++ put_futex_key(&qs[i].key); ++ ++ /* Undo the partial work we did. */ ++ for (--i; i >= 0; i--) ++ unqueue_me(&qs[i]); ++ ++ __set_current_state(TASK_RUNNING); ++ if (ret > 0) ++ goto retry; ++ goto out; ++ } ++ ++ /* We can't hold to the bucket lock when dealing with ++ * the next futex. Queue ourselves now so we can unlock ++ * it before moving on. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* There is no easy to way to check if we are wake already on ++ * multiple futexes without waking through each one of them. So ++ * just sleep and let the scheduler handle it. ++ */ ++ if (!to || to->task) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = -ETIMEDOUT; ++ /* If we were woken (and unqueued), we succeeded. */ ++ for (i = 0; i < count; i++) ++ if (!unqueue_me(&qs[i])) ++ ret = i; ++ ++ /* Succeed wakeup */ ++ if (ret >= 0) ++ goto out; ++ ++ /* Woken by triggered timeout */ ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. 
++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ ret = -ERESTART_RESTARTBLOCK; ++ out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ kfree(qs); ++ return ret; ++} ++ ++static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, ++ u32 count, ktime_t *abs_time) ++{ ++ struct futex_wait_block *wb; ++ struct restart_block *restart; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); ++ if (!wb) ++ return -ENOMEM; ++ ++ if (copy_from_user(wb, uaddr, ++ count * sizeof(struct futex_wait_block))) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ ret = do_futex_wait_multiple(wb, count, flags, abs_time); ++ ++ if (ret == -ERESTART_RESTARTBLOCK) { ++ restart = ¤t->restart_block; ++ restart->fn = futex_wait_restart; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = count; ++ restart->futex.time = *abs_time; ++ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | ++ FLAGS_WAKE_MULTIPLE); ++ } ++ ++out: ++ kfree(wb); ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -2797,6 +2942,10 @@ static long futex_wait_restart(struct restart_block *restart) + } + restart->fn = do_no_restart_syscall; + ++ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) ++ return (long)futex_wait_multiple(uaddr, restart->futex.flags, ++ restart->futex.val, tp); ++ + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); + } +@@ -3680,6 +3829,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ case FUTEX_WAIT_MULTIPLE: ++ return futex_wait_multiple(uaddr, flags, val, timeout); + } + return -ENOSYS; + } +@@ -3696,7 +3847,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3705,7 +3857,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -3889,14 +4041,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +-- +2.20.1 diff --git a/linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch b/linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch new file mode 100644 index 0000000..4d86ca6 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch @@ -0,0 +1,7601 @@ +diff --git a/Documentation/admin-guide/sysctl/kernel.rst 
b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..97ea247cc43a 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -105,6 +105,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -1175,3 +1176,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ CPU scheduler only. This determines what type of yield calls to ++sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. 
++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index b66e81c06a57..a294f8f5fd75 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index dced033875bf..d2cd03766b09 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. + */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. 
+ */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..a38ec88efbad 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -644,13 +644,18 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BMQ) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_BMQ + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -664,6 +669,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_BMQ */ + #endif + int on_rq; + +@@ -672,13 +678,23 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_BMQ ++ u64 last_ran; ++ s64 time_slice; ++ int boost_prio; ++ int bmq_idx; ++ struct list_head bmq_node; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_BMQ */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1283,6 +1299,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_BMQ ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_BMQ */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..02a3c5d34ee4 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_BMQ ++ ++#define __tsk_deadline(p) (0UL) ++ ++static inline int dl_prio(int prio) ++{ ++ return 0; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return (SCHED_NORMAL == p->policy); ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_BMQ */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..d9dc5d3ccd2e 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,17 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++#ifdef CONFIG_SCHED_BMQ ++/* +/- priority levels from the base priority */ ++#define MAX_PRIORITY_ADJ 4 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..6387c8ea9832 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_BMQ + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..f0f966219695 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..f9faeb82f677 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -717,9 +717,28 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++ Say Y here. ++ default y ++ ++config SCHED_TIMESLICE ++ int "Scheduler Task time slice" ++ depends on SCHED_BMQ ++ help ++ Time slice in ms for BMQ CPU scheduler, default 4 ms. ++ default 2 if PREEMPT ++ default 4 if !PREEMPT ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_BMQ + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -802,6 +821,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_BMQ + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -903,7 +923,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_BMQ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1150,6 +1170,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BMQ + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..c293de91d90f 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_BMQ ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +84,12 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +97,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index c87ee6412b36..45fac7b9c940 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_BMQ */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..b3bd1e65c002 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_BMQ ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2874bf556162..fad8a279fdfa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. 
+ */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_BMQ ++obj-y += bmq.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/bmq.c b/kernel/sched/bmq.c +new file mode 100644 +index 000000000000..42a2a5b3d172 +--- /dev/null ++++ b/kernel/sched/bmq.c +@@ -0,0 +1,6102 @@ ++/* ++ * kernel/sched/bmq.c ++ * ++ * BMQ Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. ++ */ ++#include "bmq_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++#define SCHED_TIMESLICE_NS (CONFIG_SCHED_TIMESLICE * 1000 * 1000) ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "bmq: BMQ CPU Scheduler 5.4-r2 by Alfred Chen.\n"); ++} ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. 
++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (SCHED_TIMESLICE_NS >>\ ++ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). ++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++static cpumask_t sched_rq_watermark[bmq_BITS] ____cacheline_aligned_in_smp; ++ ++#if (bmq_BITS <= BITS_PER_LONG) ++#define bmq_find_first_bit(bm) __ffs((bm[0])) ++#define bmq_find_next_bit(bm, start) __ffs(BITMAP_FIRST_WORD_MASK(start) & bm[0]) ++#else ++#define bmq_find_first_bit(bm) find_first_bit((bm), bmq_BITS) ++#define bmq_find_next_bit(bm, start) find_next_bit(bm, bmq_BITS, start) ++#endif ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = bmq_find_first_bit(rq->queue.bitmap); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, 
cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline int task_sched_prio(struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO)? 0:p->prio - MAX_RT_PRIO + p->boost_prio + 1; ++} ++ ++static inline void bmq_init(struct bmq *q) ++{ ++ int i; ++ ++ bitmap_zero(q->bitmap, bmq_BITS); ++ for(i = 0; i < bmq_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void bmq_init_idle(struct bmq *q, struct task_struct *idle) ++{ ++ INIT_LIST_HEAD(&q->heads[IDLE_TASK_SCHED_PRIO]); ++ list_add(&idle->bmq_node, &q->heads[IDLE_TASK_SCHED_PRIO]); ++ set_bit(IDLE_TASK_SCHED_PRIO, q->bitmap); ++} ++ ++static inline void bmq_add_task(struct task_struct *p, struct bmq *q, int idx) ++{ ++ struct list_head *n; ++ ++ if (likely(idx)) { ++ list_add_tail(&p->bmq_node, &q->heads[idx]); ++ return; ++ } ++ ++ list_for_each(n, &q->heads[idx]) ++ if (list_entry(n, struct task_struct, bmq_node)->prio > p->prio) ++ break; ++ __list_add(&p->bmq_node, n->prev, n); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *rq_first_bmq_task(struct rq *rq) ++{ ++ unsigned long idx = bmq_find_first_bit(rq->queue.bitmap); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++rq_next_bmq_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = bmq_find_next_bit(rq->queue.bitmap, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = rq_first_bmq_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = rq_next_bmq_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static 
inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ list_del(&p->bmq_node); ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->bmq_idx = task_sched_prio(p); ++ bmq_add_task(p, &rq->queue, p->bmq_idx); ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. ++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ int idx = task_sched_prio(p); ++ ++ lockdep_assert_held(&rq->lock); ++ WARN_ONCE(task_rq(p) != rq, "bmq: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ list_del(&p->bmq_node); ++ bmq_add_task(p, &rq->queue, idx); ++ if (idx != p->bmq_idx) { ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ p->bmq_idx = idx; ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. 
++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (rq_first_bmq_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * BMQ doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. 
++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. 
++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! 
++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. 
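select_fallback_rq() above escalates through three states before giving up: trust the task's own mask (after the cpuset fallback), then widen to every possible CPU, then BUG(). A userspace sketch of that escalation under stated assumptions (the cpuset step here simply retries the same mask rather than consulting cpusets; all names are invented):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

static int cpu_allowed[NR_CPUS];   /* stands in for p->cpus_ptr */
static int cpu_usable[NR_CPUS];    /* stands in for is_cpu_allowed() */

enum fb_state { FB_CPUSET, FB_POSSIBLE, FB_FAIL };

static int toy_select_fallback(void)
{
    enum fb_state state = FB_CPUSET;
    int cpu;

    for (;;) {
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            if (cpu_allowed[cpu] && cpu_usable[cpu])
                return cpu;

        switch (state) {
        case FB_CPUSET:          /* first failure: retry after cpuset fallback */
            state = FB_POSSIBLE;
            break;
        case FB_POSSIBLE:        /* no luck: allow every possible CPU */
            for (cpu = 0; cpu < NR_CPUS; cpu++)
                cpu_allowed[cpu] = 1;
            state = FB_FAIL;
            break;
        case FB_FAIL:            /* still nothing usable: this is a bug */
            fprintf(stderr, "no usable CPU\n");
            abort();
        }
    }
}

int main(void)
{
    cpu_allowed[1] = 1;          /* task was affined to CPU 1... */
    cpu_usable[2] = 1;           /* ...but only CPU 2 is still usable */
    printf("fallback cpu: %d\n", toy_select_fallback());
    return 0;
}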
++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? cpu:__best_mask_cpu(cpu, cpumask); ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** BMQ ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). 
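select_task_rq() above is a preference cascade: intersect the allowed CPUs with each candidate set, most desirable first (idle SMT group, idle CPUs, CPUs whose watermark is below this task's priority), and fall back to anything allowed. A bitmask sketch of that ordering with made-up mask values:

#include <stdio.h>
#include <stdint.h>

static uint8_t allowed = 0x3c;  /* task may run on CPUs 2-5 */
static uint8_t sg_idle = 0x00;  /* CPUs whose whole SMT sibling group is idle */
static uint8_t idle_wm = 0x10;  /* CPU 4 is idle */
static uint8_t prio_wm = 0x20;  /* CPU 5 only runs lower-priority work */

static int first_cpu(uint8_t mask)
{
    return mask ? __builtin_ctz(mask) : -1;
}

/* Stop at the first non-empty intersection, best class first. */
static int toy_select_cpu(void)
{
    uint8_t tmp;

    if ((tmp = allowed & sg_idle))
        return first_cpu(tmp);      /* an idle sibling group, best case */
    if ((tmp = allowed & idle_wm))
        return first_cpu(tmp);      /* an idle CPU */
    if ((tmp = allowed & prio_wm))
        return first_cpu(tmp);      /* a CPU running only lower-priority tasks */
    return first_cpu(allowed);      /* otherwise anything we're allowed to use */
}

int main(void)
{
    printf("selected cpu %d\n", toy_select_cpu());  /* 4: idle beats prio-only */
    return 0;
}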
++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). 
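The smp_store_release()/smp_cond_load_acquire() pairing described above maps directly onto C11 release/acquire semantics. A self-contained sketch (build with -lpthread) showing why the waker, once it sees on_cpu == 0, is guaranteed to observe everything the previous CPU wrote beforehand; names mirror the discussion but the code is illustrative only:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int task_state;            /* written while the task is still on_cpu */
static atomic_int on_cpu = 1;

static void *schedule_out(void *arg)
{
    (void)arg;
    task_state = 42;                                          /* plain store ... */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);  /* ... published by the release */
    return NULL;
}

static void *try_to_wake_up_side(void *arg)
{
    (void)arg;
    /* smp_cond_load_acquire(&p->on_cpu, !VAL): spin until 0; the acquire
     * makes everything before the release above visible afterwards. */
    while (atomic_load_explicit(&on_cpu, memory_order_acquire))
        ;
    printf("observed state %d\n", task_state);  /* guaranteed to print 42 */
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, schedule_out, NULL);
    pthread_create(&b, NULL, try_to_wake_up_side, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}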
++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if(cpu_rq(smp_processor_id())->clock - p->last_ran > SCHED_TIMESLICE_NS) ++ boost_task(p); ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. 
++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of BMQ patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = SCHED_TIMESLICE_NS; ++ resched_curr(rq); ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). 
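The fork path above splits the parent's remaining timeslice with the child so that forking never mints extra CPU time, and refills the child with a reschedule of the parent if the share left is negligible. A sketch of just that bookkeeping, with made-up constants (a 4 ms slice and a 100 µs threshold) that are not necessarily BMQ's:

#include <stdio.h>

#define SCHED_TIMESLICE_NS  4000000LL   /* placeholder: 4 ms */
#define RESCHED_NS           100000LL   /* placeholder: 100 us */

struct toy_task { long long time_slice; int need_resched; };

static void toy_fork_slice(struct toy_task *parent, struct toy_task *child)
{
    parent->time_slice /= 2;                 /* parent donates half its slice */
    child->time_slice = parent->time_slice;

    /* If the donated share is negligible, give the child a full slice and
     * mark the parent for reschedule instead. */
    if (child->time_slice < RESCHED_NS) {
        child->time_slice = SCHED_TIMESLICE_NS;
        parent->need_resched = 1;
    }
}

int main(void)
{
    struct toy_task parent = { .time_slice = 150000 }, child = { 0 };

    toy_fork_slice(&parent, &child);
    printf("parent=%lld child=%lld resched=%d\n",
           parent.time_slice, child.time_slice, parent.need_resched);
    return 0;
}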
++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. 
++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
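The preempt-notifier machinery above is, at its core, a per-task callback list fired on sched-in and sched-out (KVM uses it to save and restore vCPU state). A minimal callback-list sketch in the same spirit, with invented names and only the sched-out side wired up:

#include <stdio.h>

struct toy_notifier {
    struct toy_notifier *next;
    void (*sched_in)(struct toy_notifier *n, int cpu);
    void (*sched_out)(struct toy_notifier *n, const char *next_task);
};

static struct toy_notifier *notifier_head;

static void toy_register(struct toy_notifier *n)
{
    n->next = notifier_head;      /* analogous to hlist_add_head() */
    notifier_head = n;
}

static void fire_sched_out(const char *next_task)
{
    for (struct toy_notifier *n = notifier_head; n; n = n->next)
        n->sched_out(n, next_task);
}

static void vcpu_sched_out(struct toy_notifier *n, const char *next_task)
{
    (void)n;
    printf("save vcpu state, next is %s\n", next_task);
}

int main(void)
{
    struct toy_notifier kvm_like = { .sched_out = vcpu_sched_out };

    toy_register(&kvm_like);
    fire_sched_out("kworker/0:1");
    return 0;
}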
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. 
The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). 
++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. 
++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. 
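update_curr() above is the whole of BMQ's runtime accounting: charge the nanoseconds since the task was last examined against both its accumulated runtime and its remaining slice. A sketch with made-up clock values:

#include <stdio.h>

struct toy_rq   { long long clock_task; };
struct toy_task { long long last_ran, sched_time, time_slice; };

/* Charge the time since we last looked at this task. */
static void toy_update_curr(struct toy_rq *rq, struct toy_task *p)
{
    long long ns = rq->clock_task - p->last_ran;

    p->sched_time += ns;     /* total CPU time consumed */
    p->time_slice -= ns;     /* budget left before a forced reschedule */
    p->last_ran = rq->clock_task;
}

int main(void)
{
    struct toy_rq rq = { .clock_task = 5000000 };
    struct toy_task p = { .last_ran = 3000000, .time_slice = 4000000 };

    toy_update_curr(&rq, &p);
    printf("ran for %lldns, slice left %lldns\n", p.sched_time, p.time_slice);
    return 0;
}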
++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) ++ rq = move_queued_task(rq, p, __best_mask_cpu(cpu_of(rq), &tmp)); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu, struct rq *rq) ++{ ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void sg_balance_check(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu; ++ ++ /* exit when no sg in idle */ ++ if (cpumask_empty(&sched_sg_idle_mask)) ++ return; ++ ++ cpu = cpu_of(rq); ++ /* Only cpu in slibing idle group will do the checking */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask)) { ++ /* Find potential cpus which can migrate the currently running task */ ++ if (cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { ++ int i, tried = 0; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_intersects(cpu_smt_mask(i), ++ &sched_rq_watermark[IDLE_WM])) ++ continue; ++ if (cpumask_intersects(cpu_smt_mask(i), ++ &sched_rq_pending_mask)) ++ continue; ++ if (sg_balance_trigger(i, cpu_rq(i))) ++ return; ++ if (tried) ++ return; ++ tried++; ++ } ++ } ++ return; ++ } ++ ++ if (1 != rq->nr_running) ++ return; ++ ++ if (cpumask_andnot(&chk, cpu_smt_mask(cpu), &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM]) && ++ cpumask_equal(&chk, cpu_smt_mask(cpu))) ++ sg_balance_trigger(cpu, rq); ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ scheduler_task_tick(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
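The OFFLINE/OFFLINING/RUNNING state machine above can be modelled with C11 atomics: sched_tick_start() swaps to RUNNING and only arms the work when coming from OFFLINE, while the remote tick requeues itself only if a fetch-add-unless confirms the state is still RUNNING, otherwise it steps OFFLINING down to OFFLINE and stops. A sketch under those assumptions (fetch_add_unless is hand-rolled here; names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

enum { REMOTE_OFFLINE, REMOTE_OFFLINING, REMOTE_RUNNING };

static atomic_int state = REMOTE_OFFLINE;

/* C11 equivalent of atomic_fetch_add_unless(v, a, u): add a unless *v == u,
 * returning the old value either way. */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
    int old = atomic_load(v);

    while (old != u && !atomic_compare_exchange_weak(v, &old, old + a))
        ;
    return old;
}

/* Like sched_tick_start(): only arm the work when coming from OFFLINE;
 * coming from OFFLINING merely cancels the teardown. */
static void tick_start(void)
{
    if (atomic_exchange(&state, REMOTE_RUNNING) == REMOTE_OFFLINE)
        printf("queue remote tick work\n");
}

/* Like the tail of sched_tick_remote(): requeue only while still RUNNING. */
static void tick_requeue(void)
{
    if (fetch_add_unless(&state, -1, REMOTE_RUNNING) == REMOTE_RUNNING)
        printf("requeue in 1s\n");
    else
        printf("went offline, stop\n");    /* OFFLINING stepped down to OFFLINE */
}

int main(void)
{
    tick_start();
    tick_requeue();                          /* still RUNNING: requeues */
    atomic_store(&state, REMOTE_OFFLINING);  /* pretend hotplug began teardown */
    tick_requeue();                          /* steps down to OFFLINE */
    return 0;
}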
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = SCHED_TIMESLICE_NS; ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = rq_next_bmq_task(skip, rq)) != rq->idle) { ++ skip = rq_next_bmq_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *affinity_mask, *end_mask; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ nr_migrated = migrate_pending_tasks(src_rq, rq, cpu); ++ ++ spin_release(&src_rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated) { ++ cpufreq_update_util(rq, 0); ++ return 1; ++ } ++ } ++ } while (++affinity_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ next = rq_runnable_task(rq); ++#endif ++ rq->skip = NULL; ++ return next; ++ } ++ ++ next = rq_first_bmq_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ return rq_first_bmq_task(rq); ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if 
(panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++ if (unlikely(SCHED_TIMESLICE_NS == p->time_slice)) ++ rq->last_ts_switch = rq->clock; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, p->time_slice); ++#endif ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which BMQ doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. 
++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ if (rq_switch_time(rq) < boost_threshold(prev)) ++ boost_task(prev); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (MAX_PRIO == next->prio) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. ++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ rq->last_ts_switch = rq->clock; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & PF_WQ_WORKER) { ++ preempt_disable(); ++ wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. 
++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. 
*/ ++ if (task_on_rq_queued(p) && task_sched_prio(p) != p->bmq_idx) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(p->cpus_ptr, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. 
++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * BMQ supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. 
++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. ++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. 
++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. 
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. 
++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) { ++ current->boost_prio = MAX_PRIORITY_ADJ; ++ requeue_task(current, rq); ++ } ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In BMQ, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(SCHED_TIMESLICE_NS); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: BMQ should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = MAX_PRIO; ++ ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ bmq_init_idle(&rq->queue, idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ p = rq_first_bmq_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. 
++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = rq_first_bmq_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) 
We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * BMQ doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. 
++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, topology_sibling_cpumask(cpu)); ++#else ++ cpumask_clear_cpu(cpu, chk); ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(topology_core_cpumask(cpu)); ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ 
} ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < bmq_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ bmq_init(&rq->queue); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/bmq_sched.h b/kernel/sched/bmq_sched.h +new file mode 100644 +index 000000000000..ed08dd0b8227 +--- /dev/null ++++ b/kernel/sched/bmq_sched.h +@@ -0,0 +1,472 @@ ++#ifndef BMQ_SCHED_H ++#define BMQ_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* bits: ++ * RT, Low prio adj range, nice width, high prio adj range, cpu idle task */ ++#define bmq_BITS (NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 2) ++#define IDLE_TASK_SCHED_PRIO (bmq_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, bmq_BITS); ++ struct list_head heads[bmq_BITS]; ++}; ++ ++/* ++ * 
This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++ struct bmq queue; ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held 
rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. 
++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* BMQ_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..a816aafa6ba3 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_BMQ */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_BMQ + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -918,6 +929,7 @@ static int __init sugov_register(void) + fs_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_BMQ + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_BMQ */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..51460a446da0 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. 
*/ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -663,7 +663,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..77bf219444fa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * idle-task scheduling class. + */ +@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..22c20e28b613 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..4da52afaeff8 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_BMQ + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_BMQ + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
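The pelt.c/pelt.h hunks above compile the CFS load-tracking (PELT) helpers out entirely when the BMQ scheduler is selected, leaving inline stubs so generic callers still build. A minimal userspace sketch of that same #ifndef CONFIG_SCHED_BMQ pattern follows; update_load_avg and its arithmetic are invented stand-ins for illustration, not kernel API:

    /* pattern_demo.c: build with "cc pattern_demo.c" for the full helper,
     * or "cc -DCONFIG_SCHED_BMQ pattern_demo.c" to get the no-op stub. */
    #include <stdio.h>

    #ifndef CONFIG_SCHED_BMQ
    /* Stands in for a real PELT helper such as update_cfs_rq_load_avg(). */
    static int update_load_avg(int delta, int running)
    {
            return delta * running;         /* placeholder arithmetic */
    }
    #else
    /* With the alternative scheduler selected, the hook collapses to a stub
     * so callers keep compiling without dragging in CFS/PELT state. */
    static inline int update_load_avg(int delta, int running)
    {
            (void)delta;
            (void)running;
            return 0;
    }
    #endif

    int main(void)
    {
            printf("load contribution: %d\n", update_load_avg(4, 2));
            return 0;
    }

The same real-or-stub split is what the #ifndef CONFIG_SCHED_BMQ guards do for update_rt_rq_load_avg(), update_dl_rq_load_avg() and the other declarations in the hunks above.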
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_BMQ */ + + #else + ++#ifndef CONFIG_SCHED_BMQ + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..4bca9838b6f0 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq_sched.h" ++#else ++ + #include + + #include +@@ -2496,3 +2500,9 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_BMQ */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..0cc040a28d3f 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..435440943455 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -132,6 +132,10 @@ static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_BMQ ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BMQ) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_BMQ + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_BMQ */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_BMQ ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c 
b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..70b97fe0ff44 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_BMQ + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..208788fcbb0e 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_BMQ ++ /* No deadline on BMQ, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch b/linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch new file mode 100644 index 0000000..5e78811 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch @@ -0,0 +1,108 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define 
MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
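The Kconfig.hz hunks in the glitched patch above add 500 Hz and 750 Hz timer-frequency choices and make them the defaults. As a rough feel for the trade-off their help texts describe, here is a standalone userspace sketch (not kernel code) printing the scheduler-tick period each HZ value implies:

    /* hz_period.c: one jiffy lasts 1/HZ seconds. */
    #include <stdio.h>

    int main(void)
    {
            const int hz_choices[] = { 100, 250, 300, 500, 750, 1000 };
            size_t i;

            for (i = 0; i < sizeof(hz_choices) / sizeof(hz_choices[0]); i++)
                    printf("HZ=%4d -> tick every %7.1f us\n",
                           hz_choices[i], 1e6 / hz_choices[i]);
            return 0;
    }

Moving from the stock 250 Hz (4000 us per tick) to 500 Hz or 750 Hz halves or roughly thirds the tick period, which is the interactivity-versus-overhead balance the added help texts refer to.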
diff --git a/linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch b/linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff --git a/linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch b/linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch new file mode 100644 index 0000000..b50ec74 --- /dev/null +++ b/linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch @@ -0,0 +1,2806 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 5594c8bf1dcd..ac80978f4629 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -505,16 +505,6 @@ + nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. + +- checkreqprot [SELINUX] Set initial checkreqprot flag value. +- Format: { "0" | "1" } +- See security/selinux/Kconfig help text. +- 0 -- check protection applied by kernel (includes +- any implied execute protection). +- 1 -- check protection requested by application. +- Default value is set via a kernel config option. +- Value can be changed at runtime via +- /selinux/checkreqprot. +- + cio_ignore= [S390] + See Documentation/s390/common_io.rst for details. + clk_ignore_unused +@@ -3345,6 +3335,11 @@ + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + ++ extra_latent_entropy ++ Enable a very simple form of latent entropy extraction ++ from the first 4GB of memory as the bootmem allocator ++ passes the memory pages to the buddy allocator. ++ + pcbit= [HW,ISDN] + + pcd. 
[PARIDE] +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..cc3491b05976 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -102,6 +102,7 @@ show up in /proc/sys/kernel: + - sysctl_writes_strict + - tainted ==> Documentation/admin-guide/tainted-kernels.rst + - threads-max ++- tiocsti_restrict + - unknown_nmi_panic + - watchdog + - watchdog_thresh +@@ -1114,6 +1115,25 @@ thread structures would occupy too much (more than 1/8th) of the + available RAM pages threads-max is reduced accordingly. + + ++tiocsti_restrict: ++================= ++ ++This toggle indicates whether unprivileged users are prevented from using the ++TIOCSTI ioctl to inject commands into other processes which share a tty ++session. ++ ++When tiocsti_restrict is set to (0) there are no restrictions(accept the ++default restriction of only being able to injection commands into one's own ++tty). When tiocsti_restrict is set to (1), users must have CAP_SYS_ADMIN to ++use the TIOCSTI ioctl. ++ ++When user namespaces are in use, the check for the capability CAP_SYS_ADMIN is ++done against the user namespace that originally opened the tty. ++ ++The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the default ++value of tiocsti_restrict. ++ ++ + unknown_nmi_panic: + ================== + +diff --git a/arch/Kconfig b/arch/Kconfig +index 5f8a5d84dbbe..60103a76d33e 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -653,7 +653,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -687,7 +687,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +@@ -906,6 +906,7 @@ config ARCH_HAS_REFCOUNT + + config REFCOUNT_FULL + bool "Perform full reference count validation at the expense of speed" ++ default y + help + Enabling this switches the refcounting infrastructure from a fast + unchecked atomic_t implementation to a fully state checked +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 6ccd2ed30963..56d39ec3c2c3 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1139,6 +1139,7 @@ config RODATA_FULL_DEFAULT_ENABLED + + config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" ++ default y + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved +@@ -1538,6 +1539,7 @@ config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS if MODULES + select RELOCATABLE ++ default y + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts +diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug 
+index cf09010d825f..dc4083ceff57 100644 +--- a/arch/arm64/Kconfig.debug ++++ b/arch/arm64/Kconfig.debug +@@ -43,6 +43,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig +index c9a867ac32d4..5c4d264f6a6e 100644 +--- a/arch/arm64/configs/defconfig ++++ b/arch/arm64/configs/defconfig +@@ -1,4 +1,3 @@ +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ_IDLE=y +diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h +index b618017205a3..0a228dbcad65 100644 +--- a/arch/arm64/include/asm/elf.h ++++ b/arch/arm64/include/asm/elf.h +@@ -103,14 +103,10 @@ + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +-#ifdef CONFIG_ARM64_FORCE_52BIT +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +-#else +-#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) +-#endif /* CONFIG_ARM64_FORCE_52BIT */ ++#define ELF_ET_DYN_BASE 0x100000000UL + + #ifndef __ASSEMBLY__ + +@@ -164,10 +160,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, + /* 1GB of VA */ + #ifdef CONFIG_COMPAT + #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ +- 0x7ff >> (PAGE_SHIFT - 12) : \ +- 0x3ffff >> (PAGE_SHIFT - 12)) ++ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ ++ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #else +-#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) ++#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #endif + + #ifdef __AARCH64EB__ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..e16076b30625 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1219,8 +1219,7 @@ config VM86 + default X86_LEGACY_VM86 + + config X86_16BIT +- bool "Enable support for 16-bit segments" if EXPERT +- default y ++ bool "Enable support for 16-bit segments" + depends on MODIFY_LDT_SYSCALL + ---help--- + This option is required by programs like Wine to run 16-bit +@@ -2365,7 +2364,7 @@ config COMPAT_VDSO + choice + prompt "vsyscall table for legacy applications" + depends on X86_64 +- default LEGACY_VSYSCALL_XONLY ++ default LEGACY_VSYSCALL_NONE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in +@@ -2461,8 +2460,7 @@ config CMDLINE_OVERRIDE + be set to 'N' under normal conditions. + + config MODIFY_LDT_SYSCALL +- bool "Enable the LDT (local descriptor table)" if EXPERT +- default y ++ bool "Enable the LDT (local descriptor table)" + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index bf9cd83de777..13ef90f3de52 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -91,6 +91,7 @@ config EFI_PGT_DUMP + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select X86_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. 
+ +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index d0a5ffeae8df..2a91d4a9f640 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -1,5 +1,4 @@ + # CONFIG_LOCALVERSION_AUTO is not set +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_TASKSTATS=y +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c +index f5937742b290..6655ce228e25 100644 +--- a/arch/x86/entry/vdso/vma.c ++++ b/arch/x86/entry/vdso/vma.c +@@ -198,55 +198,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) + } + + #ifdef CONFIG_X86_64 +-/* +- * Put the vdso above the (randomized) stack with another randomized +- * offset. This way there is no hole in the middle of address space. +- * To save memory make sure it is still in the same PTE as the stack +- * top. This doesn't give that many random bits. +- * +- * Note that this algorithm is imperfect: the distribution of the vdso +- * start address within a PMD is biased toward the end. +- * +- * Only used for the 64-bit and x32 vdsos. +- */ +-static unsigned long vdso_addr(unsigned long start, unsigned len) +-{ +- unsigned long addr, end; +- unsigned offset; +- +- /* +- * Round up the start address. It can start out unaligned as a result +- * of stack start randomization. +- */ +- start = PAGE_ALIGN(start); +- +- /* Round the lowest possible end address up to a PMD boundary. */ +- end = (start + len + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE_MAX) +- end = TASK_SIZE_MAX; +- end -= len; +- +- if (end > start) { +- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); +- addr = start + (offset << PAGE_SHIFT); +- } else { +- addr = start; +- } +- +- /* +- * Forcibly align the final address in case we have a hardware +- * issue that requires alignment for performance reasons. +- */ +- addr = align_vdso_addr(addr); +- +- return addr; +-} +- + static int map_vdso_randomized(const struct vdso_image *image) + { +- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); +- +- return map_vdso(image, addr); ++ return map_vdso(image, 0); + } + #endif + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index 69c0f892e310..f9f7a85bb71e 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -248,11 +248,11 @@ extern int force_personality32; + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ + #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ +- (DEFAULT_MAP_WINDOW / 3 * 2)) ++ 0x100000000UL) + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -312,8 +312,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + + #ifdef CONFIG_X86_32 + +-#define __STACK_RND_MASK(is32bit) (0x7ff) +-#define STACK_RND_MASK (0x7ff) ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) + + #define ARCH_DLINFO ARCH_DLINFO_IA32 + +@@ -322,7 +322,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + #else /* CONFIG_X86_32 */ + + /* 1GB for 64bit, 8MB for 32bit */ +-#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
0x7ff : 0x3fffff) ++#ifdef CONFIG_COMPAT ++#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) ++#else ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#endif + #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) + + #define ARCH_DLINFO \ +@@ -380,5 +384,4 @@ struct va_alignment { + } ____cacheline_aligned; + + extern struct va_alignment va_align; +-extern unsigned long align_vdso_addr(unsigned long); + #endif /* _ASM_X86_ELF_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 6f66d841262d..b786e7cb395d 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -295,6 +295,7 @@ static inline void cr4_set_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + } +@@ -305,6 +306,7 @@ static inline void cr4_clear_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + } +@@ -334,6 +336,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + __cr4_set(cr4 ^ mask); + } + +@@ -440,6 +443,7 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index fffe21945374..e9e124eb6ccb 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1854,7 +1854,6 @@ void cpu_init(void) + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + +- x86_configure_nx(); + x2apic_setup(); + + /* +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 5e94c4354d4e..093bd8ad1130 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -42,6 +42,8 @@ + #include + #include + #include ++#include ++#include + + #include "process.h" + +@@ -798,7 +800,10 @@ unsigned long arch_align_stack(unsigned long sp) + + unsigned long arch_randomize_brk(struct mm_struct *mm) + { +- return randomize_page(mm->brk, 0x02000000); ++ if (mmap_is_ia32()) ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; ++ else ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c +index f7476ce23b6e..652169a2b23a 100644 +--- a/arch/x86/kernel/sys_x86_64.c ++++ b/arch/x86/kernel/sys_x86_64.c +@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) + return va_align.bits & get_align_mask(); + } + +-unsigned long align_vdso_addr(unsigned long addr) +-{ +- unsigned long align_mask = get_align_mask(); +- addr = (addr + align_mask) & ~align_mask; +- return addr | get_align_bits(); +-} +- + static int __init control_va_addr_alignment(char *str) + { + /* guard against enabling this on other CPU families */ +@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, + } + + *begin = get_mmap_base(1); +- if (in_32bit_syscall()) +- *end = task_size_32bit(); +- else +- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); ++ *end = get_mmap_base(0); + } + + unsigned long +@@ -210,7 +200,7 @@ 
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; +- info.low_limit = PAGE_SIZE; ++ info.low_limit = get_mmap_base(1); + info.high_limit = get_mmap_base(0); + + /* +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 0a74407ef92e..5ceff405c81c 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -560,9 +560,9 @@ static void __init pagetable_init(void) + + #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index b8541d77452c..a231504e0348 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -97,9 +97,9 @@ DEFINE_ENTRY(pte, pte, init) + */ + + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = ~0; ++pteval_t __supported_pte_mask __ro_after_init = ~0; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = ~0; ++pteval_t __default_kernel_pte_mask __ro_after_init = ~0; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +diff --git a/block/blk-softirq.c b/block/blk-softirq.c +index 457d9ba3eb20..5f987fc1c0a0 100644 +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static __latent_entropy void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index 84b183a6424e..b83bff5e9ab5 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -5143,7 +5143,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) + struct ata_port *ap; + unsigned int tag; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + ap = qc->ap; + + qc->flags = 0; +@@ -5160,7 +5160,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) + struct ata_port *ap; + struct ata_link *link; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); + ap = qc->ap; + link = qc->dev->link; +diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig +index df0fc997dc3e..bd8eed8de6c1 100644 +--- a/drivers/char/Kconfig ++++ b/drivers/char/Kconfig +@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" + + config DEVMEM + bool "/dev/mem virtual device support" +- default y + help + Say Y here if you want to support the /dev/mem device. 
+ The /dev/mem device is used to access areas of physical +@@ -514,7 +513,6 @@ config TELCLOCK + config DEVPORT + bool "/dev/port character device" + depends on ISA || PCI +- default y + help + Say Y here if you want to support the /dev/port device. The /dev/port + device is similar to /dev/mem, but for I/O ports. +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index c7623f99ac0f..859c2782c8e2 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -122,7 +122,6 @@ config UNIX98_PTYS + + config LEGACY_PTYS + bool "Legacy (BSD) PTY support" +- default y + ---help--- + A pseudo terminal (PTY) is a software device consisting of two + halves: a master and a slave. The slave device behaves identical to +diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c +index 802c1210558f..0cc320f33cdc 100644 +--- a/drivers/tty/tty_io.c ++++ b/drivers/tty/tty_io.c +@@ -173,6 +173,7 @@ static void free_tty_struct(struct tty_struct *tty) + put_device(tty->dev); + kfree(tty->write_buf); + tty->magic = 0xDEADDEAD; ++ put_user_ns(tty->owner_user_ns); + kfree(tty); + } + +@@ -2180,11 +2181,19 @@ static int tty_fasync(int fd, struct file *filp, int on) + * FIXME: may race normal receive processing + */ + ++int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); ++ + static int tiocsti(struct tty_struct *tty, char __user *p) + { + char ch, mbz = 0; + struct tty_ldisc *ld; + ++ if (tiocsti_restrict && ++ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { ++ dev_warn_ratelimited(tty->dev, ++ "Denied TIOCSTI ioctl for non-privileged process\n"); ++ return -EPERM; ++ } + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, p)) +@@ -3004,6 +3013,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) + tty->index = idx; + tty_line_name(driver, idx, tty->name); + tty->dev = tty_get_device(tty); ++ tty->owner_user_ns = get_user_ns(current_user_ns()); + + return tty; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index 4ac74b354801..7c2cb5b3a449 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -42,6 +42,8 @@ + #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ + #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ + ++extern int deny_new_usb; ++ + /* Protect struct usb_device->state and ->children members + * Note: Both are also protected by ->dev.sem, except that ->state can + * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. 
*/ +@@ -4991,6 +4993,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, + goto done; + return; + } ++ ++ if (deny_new_usb) { ++ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); ++ goto done; ++ } ++ + if (hub_is_superspeed(hub->hdev)) + unit_load = 150; + else +diff --git a/fs/exec.c b/fs/exec.c +index c27231234764..4038334db213 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -276,6 +277,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + arch_bprm_mm_init(mm, vma); + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); ++ if (randomize_va_space) ++ bprm->p ^= get_random_int() & ~PAGE_MASK; + return 0; + err: + up_write(&mm->mmap_sem); +diff --git a/fs/namei.c b/fs/namei.c +index 671c3c1a3425..618ef0b5d000 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -877,10 +877,10 @@ static inline void put_link(struct nameidata *nd) + path_put(&last->link); + } + +-int sysctl_protected_symlinks __read_mostly = 0; +-int sysctl_protected_hardlinks __read_mostly = 0; +-int sysctl_protected_fifos __read_mostly; +-int sysctl_protected_regular __read_mostly; ++int sysctl_protected_symlinks __read_mostly = 1; ++int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 2; ++int sysctl_protected_regular __read_mostly = 2; + + /** + * may_follow_link - Check symlink following for unsafe situations +diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig +index 295a7a21b774..3aed361bc0f9 100644 +--- a/fs/nfs/Kconfig ++++ b/fs/nfs/Kconfig +@@ -195,4 +195,3 @@ config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 +- default y +diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig +index cb5629bd5fff..bc44606fcc48 100644 +--- a/fs/proc/Kconfig ++++ b/fs/proc/Kconfig +@@ -41,7 +41,6 @@ config PROC_KCORE + config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP +- default y + help + Exports the dump image of crashed kernel in ELF format. 
+ +diff --git a/fs/stat.c b/fs/stat.c +index c38e4c2e1221..6135fbaf7298 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); +- stat->atime = inode->i_atime; +- stat->mtime = inode->i_mtime; ++ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = inode->i_ctime; ++ stat->mtime = inode->i_ctime; ++ } else { ++ stat->atime = inode->i_atime; ++ stat->mtime = inode->i_mtime; ++ } + stat->ctime = inode->i_ctime; + stat->blksize = i_blocksize(inode); + stat->blocks = inode->i_blocks; +@@ -77,9 +82,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; + +- if (inode->i_op->getattr) +- return inode->i_op->getattr(path, stat, request_mask, +- query_flags); ++ if (inode->i_op->getattr) { ++ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); ++ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = stat->ctime; ++ stat->mtime = stat->ctime; ++ } ++ return retval; ++ } + + generic_fillattr(inode, stat); + return 0; +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index d99d166fd892..7a4f2854feb8 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -28,7 +28,11 @@ + #include + #include + ++#ifdef CONFIG_USERFAULTFD_UNPRIVILEGED + int sysctl_unprivileged_userfaultfd __read_mostly = 1; ++#else ++int sysctl_unprivileged_userfaultfd __read_mostly; ++#endif + + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + +diff --git a/include/linux/cache.h b/include/linux/cache.h +index 750621e41d1c..e7157c18c62c 100644 +--- a/include/linux/cache.h ++++ b/include/linux/cache.h +@@ -31,6 +31,8 @@ + #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) + #endif + ++#define __read_only __ro_after_init ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff --git a/include/linux/capability.h b/include/linux/capability.h +index ecce0f43c73a..e46306dd4401 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -208,6 +208,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); + extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); + extern bool capable(int cap); ++extern bool capable_noaudit(int cap); + extern bool ns_capable(struct user_namespace *ns, int cap); + extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); + extern bool ns_capable_setid(struct user_namespace *ns, int cap); +@@ -234,6 +235,10 @@ static inline bool capable(int cap) + { + return true; + } ++static inline bool capable_noaudit(int cap) ++{ ++ return true; ++} + static inline bool ns_capable(struct user_namespace *ns, int cap) + { + return true; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 0b4d8fc79e0f..6f318e089249 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3627,4 +3627,15 @@ static inline int inode_drain_writes(struct inode *inode) + return filemap_write_and_wait(inode->i_mapping); + } + ++extern int device_sidechannel_restrict; ++ ++static inline bool is_sidechannel_device(const struct inode *inode) ++{ ++ umode_t mode; ++ if (!device_sidechannel_restrict) ++ return false; ++ mode = inode->i_mode; ++ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & 
(S_IROTH | S_IWOTH))); ++} ++ + #endif /* _LINUX_FS_H */ +diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h +index a2d5d175d3c1..e91ab06119b0 100644 +--- a/include/linux/fsnotify.h ++++ b/include/linux/fsnotify.h +@@ -233,6 +233,9 @@ static inline void fsnotify_access(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_ACCESS; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +@@ -249,6 +252,9 @@ static inline void fsnotify_modify(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_MODIFY; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index 61f2f6ff9467..f9b3e3d675ae 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -553,9 +553,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); + extern unsigned long get_zeroed_page(gfp_t gfp_mask); + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask); ++void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); + void free_pages_exact(void *virt, size_t size); +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); ++void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); + + #define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index ea5cdbd8c2c3..805b84d6bbca 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -215,6 +215,13 @@ static inline void clear_highpage(struct page *page) + kunmap_atomic(kaddr); + } + ++static inline void verify_zero_highpage(struct page *page) ++{ ++ void *kaddr = kmap_atomic(page); ++ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); ++ kunmap_atomic(kaddr); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 89fc59dab57d..5f98e14e9470 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -540,7 +540,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); +@@ -555,7 +555,7 @@ static inline void do_softirq_own_stack(void) + } + #endif + +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void __init open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + +diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h +index 069aa2ebef90..cb9e3637a620 100644 +--- a/include/linux/kobject_ns.h ++++ b/include/linux/kobject_ns.h +@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { + void (*drop_ns)(void *); + }; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); + int kobj_ns_type_registered(enum kobj_ns_type type); + const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); + const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index b249d2e033aa..a4855777d1fa 
100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -664,7 +664,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) + } + #endif + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node); ++extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); + static inline void *kvmalloc(size_t size, gfp_t flags) + { + return kvmalloc_node(size, flags, NUMA_NO_NODE); +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 5e76af742c80..9a6c682ec127 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); +-extern void __percpu *__alloc_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); ++extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern void free_percpu(void __percpu *__pdata); + extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 68ccc5b1913b..a7565ea44938 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1241,6 +1241,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + ++static inline bool perf_paranoid_any(void) ++{ ++ return sysctl_perf_event_paranoid > 2; ++} ++ + static inline bool perf_paranoid_tracepoint_raw(void) + { + return sysctl_perf_event_paranoid > -1; +diff --git a/include/linux/slab.h b/include/linux/slab.h +index 4d2a2fa55ed5..be3a8234edde 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -184,8 +184,8 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check __krealloc(const void *, size_t, gfp_t); +-void * __must_check krealloc(const void *, size_t, gfp_t); ++void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); ++void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); + void kfree(const void *); + void kzfree(const void *); + size_t __ksize(const void *); +@@ -390,7 +390,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) + } + #endif /* !CONFIG_SLOB */ + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; ++void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; + void kmem_cache_free(struct kmem_cache *, void *); + +@@ -414,7 +414,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) + } + + #ifdef 
CONFIG_NUMA +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; ++void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; + #else + static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +@@ -539,7 +539,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + * Try really hard to succeed the allocation but fail + * eventually. + */ +-static __always_inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size)) { + #ifndef CONFIG_SLOB +@@ -581,7 +581,7 @@ static __always_inline unsigned int kmalloc_size(unsigned int n) + return 0; + } + +-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) + { + #ifndef CONFIG_SLOB + if (__builtin_constant_p(size) && +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index d2153789bd9f..97da977d6060 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -121,6 +121,11 @@ struct kmem_cache { + unsigned long random; + #endif + ++#ifdef CONFIG_SLAB_CANARY ++ unsigned long random_active; ++ unsigned long random_inactive; ++#endif ++ + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +diff --git a/include/linux/string.h b/include/linux/string.h +index b6ccdc2c7f02..6d66b8740f90 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -268,10 +268,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob + void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + ++#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING ++#define __string_size(p) __builtin_object_size(p, 1) ++#else ++#define __string_size(p) __builtin_object_size(p, 0) ++#endif ++ + #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) +@@ -281,7 +287,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strcat(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) +@@ -292,7 +298,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + { + __kernel_size_t ret; +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || +@@ -307,7 +313,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); + __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t 
maxlen) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); +@@ -319,8 +325,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); + __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + { + size_t ret; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); +@@ -340,8 +346,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) + { + size_t p_len, copy_len; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); +@@ -454,8 +460,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) + /* defined after fortified strlen and memcpy to reuse them */ + __FORTIFY_INLINE char *strcpy(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + memcpy(p, q, strlen(q) + 1); +diff --git a/include/linux/tty.h b/include/linux/tty.h +index bfa4e2ee94a9..3e18d583fc8d 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + + /* +@@ -336,6 +337,7 @@ struct tty_struct { + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; + struct tty_port *port; ++ struct user_namespace *owner_user_ns; + } __randomize_layout; + + /* Each of a tty's open files has private_data pointing to tty_file_private */ +@@ -345,6 +347,8 @@ struct tty_file_private { + struct list_head list; + }; + ++extern int tiocsti_restrict; ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 4e7809408073..0b58a5176a25 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -88,19 +88,19 @@ static inline void vmalloc_init(void) + static inline unsigned long vmalloc_nr_pages(void) { return 0; } + #endif + +-extern void *vmalloc(unsigned long size); +-extern void *vzalloc(unsigned long size); +-extern void *vmalloc_user(unsigned long size); +-extern void *vmalloc_node(unsigned long size, int node); +-extern void *vzalloc_node(unsigned long size, int node); +-extern void *vmalloc_exec(unsigned long size); +-extern void *vmalloc_32(unsigned long size); +-extern void *vmalloc_32_user(unsigned long size); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vzalloc_node(unsigned long size, int node) 
__attribute__((alloc_size(1))); ++extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); + extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, +- const void *caller); ++ const void *caller) __attribute__((alloc_size(1))); + #ifndef CONFIG_MMU + extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); + static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..c1016fd960f0 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -381,6 +381,7 @@ config USELIB + config AUDIT + bool "Auditing support" + depends on NET ++ default y + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for +@@ -1118,6 +1119,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ depends on USER_NS ++ default n ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say N. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1538,8 +1555,7 @@ config SHMEM + which may be appropriate on small systems without swap. + + config AIO +- bool "Enable AIO support" if EXPERT +- default y ++ bool "Enable AIO support" + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling +@@ -1650,6 +1666,23 @@ config USERFAULTFD + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + ++config USERFAULTFD_UNPRIVILEGED ++ bool "Allow unprivileged users to use the userfaultfd syscall" ++ depends on USERFAULTFD ++ default n ++ help ++ When disabled, unprivileged users will not be able to use the userfaultfd ++ syscall. Userfaultfd provide attackers with a way to stall a kernel ++ thread in the middle of memory accesses from userspace by initiating an ++ access on an unmapped page. To avoid various heap grooming and heap ++ spraying techniques for exploiting use-after-free flaws this should be ++ disabled by default. ++ ++ This setting can be overridden at runtime via the ++ vm.unprivileged_userfaultfd sysctl. ++ ++ If unsure, say N. ++ + config ARCH_HAS_MEMBARRIER_CALLBACKS + bool + +@@ -1762,7 +1795,7 @@ config VM_EVENT_COUNTERS + + config SLUB_DEBUG + default y +- bool "Enable SLUB debugging support" if EXPERT ++ bool "Enable SLUB debugging support" + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. 
Disabling these can +@@ -1786,7 +1819,6 @@ config SLUB_MEMCG_SYSFS_ON + + config COMPAT_BRK + bool "Disable heap randomization" +- default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1833,7 +1865,6 @@ endchoice + + config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" +- default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. +@@ -1846,9 +1877,9 @@ config SLAB_MERGE_DEFAULT + command line. + + config SLAB_FREELIST_RANDOM +- default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" ++ default y + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab +@@ -1857,12 +1888,30 @@ config SLAB_FREELIST_RANDOM + config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB ++ default y + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. + ++config SLAB_CANARY ++ depends on SLUB ++ depends on !SLAB_MERGE_DEFAULT ++ bool "SLAB canaries" ++ default y ++ help ++ Place canaries at the end of kernel slab allocations, sacrificing ++ some performance and memory usage for security. ++ ++ Canaries can detect some forms of heap corruption when allocations ++ are freed and as part of the HARDENED_USERCOPY feature. It provides ++ basic use-after-free detection for HARDENED_USERCOPY. ++ ++ Canaries absorb small overflows (rendering them harmless), mitigate ++ non-NUL terminated C string overflows on 64-bit via a guaranteed zero ++ byte and provide basic double-free detection. ++ + config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA +diff --git a/kernel/audit.c b/kernel/audit.c +index da8dc0db5bd3..62dda6867dd9 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -1628,6 +1628,9 @@ static int __init audit_enable(char *str) + + if (audit_default == AUDIT_OFF) + audit_initialized = AUDIT_DISABLED; ++ else if (!audit_ever_enabled) ++ audit_initialized = AUDIT_UNINITIALIZED; ++ + if (audit_set_enabled(audit_default)) + pr_err("audit: error setting audit state (%d)\n", + audit_default); +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index ef0e1e3e66f4..d1ddc8695ab8 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -519,7 +519,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) + #ifdef CONFIG_BPF_JIT + /* All BPF JIT sysctl knobs here. 
*/ + int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +-int bpf_jit_harden __read_mostly; ++int bpf_jit_harden __read_mostly = 2; + int bpf_jit_kallsyms __read_mostly; + long bpf_jit_limit __read_mostly; + +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index ace1cfaa24b6..37e08fc44a6b 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -39,7 +39,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); + static DEFINE_IDR(map_idr); + static DEFINE_SPINLOCK(map_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = 1; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _ops) +diff --git a/kernel/capability.c b/kernel/capability.c +index 1444f3954d75..8cc9dd7992f2 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -449,6 +449,12 @@ bool capable(int cap) + return ns_capable(&init_user_ns, cap); + } + EXPORT_SYMBOL(capable); ++ ++bool capable_noaudit(int cap) ++{ ++ return ns_capable_noaudit(&init_user_ns, cap); ++} ++EXPORT_SYMBOL(capable_noaudit); + #endif /* CONFIG_MULTIUSER */ + + /** +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 6c829e22bad3..3063a7239a94 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -398,8 +398,13 @@ static cpumask_var_t perf_online_mask; + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv ++ * 3 - disallow all unpriv perf event use + */ ++#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT ++int sysctl_perf_event_paranoid __read_mostly = 3; ++#else + int sysctl_perf_event_paranoid __read_mostly = 2; ++#endif + + /* Minimum for 512 kiB + 1 user control page */ + int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +@@ -10895,6 +10900,9 @@ SYSCALL_DEFINE5(perf_event_open, + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + ++ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; +diff --git a/kernel/fork.c b/kernel/fork.c +index 755d8160e001..ed909f8050b2 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -106,6 +106,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1779,6 +1784,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -2836,6 +2845,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c +index 477b4eb44af5..db28cc3fd301 100644 +--- a/kernel/rcu/tiny.c ++++ b/kernel/rcu/tiny.c +@@ -74,7 +74,7 @@ void rcu_sched_clock_irq(int user) + } + + /* Invoke the RCU callbacks whose grace period has elapsed. */ +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + struct rcu_head *next, *list; + unsigned long flags; +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 81105141b6a8..38f04f653d29 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2381,7 +2381,7 @@ static __latent_entropy void rcu_core(void) + trace_rcu_utilization(TPS("End RCU core")); + } + +-static void rcu_core_si(struct softirq_action *h) ++static void rcu_core_si(void) + { + rcu_core(); + } +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index c87a798d1456..341c384cc597 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9889,7 +9889,7 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) ++static __latent_entropy void run_rebalance_domains(void) + { + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? 
+diff --git a/kernel/softirq.c b/kernel/softirq.c +index 0427a86743a4..5e6a9b4ccb41 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -52,7 +52,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + EXPORT_PER_CPU_SYMBOL(irq_stat); + #endif + +-static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; ++static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); + + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); +- h->action(h); ++ h->action(); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", +@@ -452,7 +452,7 @@ void __raise_softirq_irqoff(unsigned int nr) + or_softirq_pending(1UL << nr); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void __init open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -498,8 +498,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static void tasklet_action_common(struct softirq_action *a, +- struct tasklet_head *tl_head, ++static void tasklet_action_common(struct tasklet_head *tl_head, + unsigned int softirq_nr) + { + struct tasklet_struct *list; +@@ -536,14 +535,14 @@ static void tasklet_action_common(struct softirq_action *a, + } + } + +-static __latent_entropy void tasklet_action(struct softirq_action *a) ++static __latent_entropy void tasklet_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + } + +-static __latent_entropy void tasklet_hi_action(struct softirq_action *a) ++static __latent_entropy void tasklet_hi_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + } + + void tasklet_init(struct tasklet_struct *t, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 70665934d53e..8ea67d08b926 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include "../lib/kstrtox.h" + +@@ -104,12 +105,19 @@ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file. 
*/ ++#if IS_ENABLED(CONFIG_USB) ++int deny_new_usb __read_mostly = 0; ++EXPORT_SYMBOL(deny_new_usb); ++#endif + extern int suid_dumpable; + #ifdef CONFIG_COREDUMP + extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -121,32 +129,32 @@ extern int sysctl_nr_trim_pages; + + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR +-static int sixty = 60; ++static int sixty __read_only = 60; + #endif + +-static int __maybe_unused neg_one = -1; +-static int __maybe_unused two = 2; +-static int __maybe_unused four = 4; +-static unsigned long zero_ul; +-static unsigned long one_ul = 1; +-static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __maybe_unused neg_one __read_only = -1; ++static int __maybe_unused two __read_only = 2; ++static int __maybe_unused four __read_only = 4; ++static unsigned long zero_ul __read_only; ++static unsigned long one_ul __read_only = 1; ++static unsigned long long_max __read_only = LONG_MAX; ++static int one_hundred __read_only = 100; ++static int one_thousand __read_only = 1000; + #ifdef CONFIG_PRINTK +-static int ten_thousand = 10000; ++static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +-static int six_hundred_forty_kb = 640 * 1024; ++static int six_hundred_forty_kb __read_only = 640 * 1024; + #endif + + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ++static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +-static int maxolduid = 65535; +-static int minolduid; ++static int maxolduid __read_only = 65535; ++static int minolduid __read_only; + +-static int ngroups_max = NGROUPS_MAX; ++static int ngroups_max __read_only = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + + /* +@@ -154,9 +162,12 @@ static const int cap_last_cap = CAP_LAST_CAP; + * and hung_task_check_interval_secs + */ + #ifdef CONFIG_DETECT_HUNG_TASK +-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); ++static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); + #endif + ++int device_sidechannel_restrict __read_mostly = 1; ++EXPORT_SYMBOL(device_sidechannel_restrict); ++ + #ifdef CONFIG_INOTIFY_USER + #include + #endif +@@ -301,19 +312,19 @@ static struct ctl_table sysctl_base_table[] = { + }; + + #ifdef CONFIG_SCHED_DEBUG +-static int min_sched_granularity_ns = 100000; /* 100 usecs */ +-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +-static int min_wakeup_granularity_ns; /* 0 usecs */ +-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ++static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ ++static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ ++static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ ++static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + #ifdef CONFIG_SMP +-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; ++static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; ++static int max_sched_tunable_scaling 
__read_only = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ + #endif /* CONFIG_SCHED_DEBUG */ + + #ifdef CONFIG_COMPACTION +-static int min_extfrag_threshold; +-static int max_extfrag_threshold = 1000; ++static int min_extfrag_threshold __read_only; ++static int max_extfrag_threshold __read_only = 1000; + #endif + + static struct ctl_table kern_table[] = { +@@ -546,6 +557,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +@@ -901,6 +921,37 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, ++#endif ++#if defined CONFIG_TTY ++ { ++ .procname = "tiocsti_restrict", ++ .data = &tiocsti_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#endif ++ { ++ .procname = "device_sidechannel_restrict", ++ .data = &device_sidechannel_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#if IS_ENABLED(CONFIG_USB) ++ { ++ .procname = "deny_new_usb", ++ .data = &deny_new_usb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, + #endif + { + .procname = "ngroups_max", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 7f31932216a1..9ede224fc81f 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1583,7 +1583,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + } + } + +-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) ++static __latent_entropy void hrtimer_run_softirq(void) + { + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 4820823515e9..1a61e5aa87ae 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1779,7 +1779,7 @@ static inline void __run_timers(struct timer_base *base) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static __latent_entropy void run_timer_softirq(struct softirq_action *h) ++static __latent_entropy void run_timer_softirq(void) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 8eadadc478f9..c36ecd19562c 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -21,6 +21,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 93d97f9b0157..fb923cae2120 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -352,6 +352,9 @@ config SECTION_MISMATCH_WARN_ONLY + + If unsure, say Y. 
+ ++config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE ++ bool "Enable verbose reporting of writable function pointers" ++ + # + # Select this config option from the architecture Kconfig, if it + # is preferred to always offer frame pointers as a config +@@ -974,6 +977,7 @@ endmenu # "Debug lockups and hangs" + + config PANIC_ON_OOPS + bool "Panic on Oops" ++ default y + help + Say Y here to enable the kernel to panic when it oopses. This + has the same effect as setting oops=panic on the kernel command +@@ -983,7 +987,7 @@ config PANIC_ON_OOPS + anything erroneous after an oops which could result in data + corruption or other issues. + +- Say N if unsure. ++ Say Y if unsure. + + config PANIC_ON_OOPS_VALUE + int +@@ -1352,6 +1356,7 @@ config DEBUG_BUGVERBOSE + config DEBUG_LIST + bool "Debug linked list manipulation" + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION ++ default y + help + Enable this to turn on extended checks in the linked-list + walking routines. +@@ -2073,6 +2078,7 @@ config MEMTEST + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST ++ default y + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked +@@ -2112,6 +2118,7 @@ config STRICT_DEVMEM + config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM ++ default y + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d58e..b6e7996a0058 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) + } + EXPORT_SYMBOL(irq_poll_complete); + +-static void __latent_entropy irq_poll_softirq(struct softirq_action *h) ++static void __latent_entropy irq_poll_softirq(void) + { + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); + int rearm = 0, budget = irq_poll_budget; +diff --git a/lib/kobject.c b/lib/kobject.c +index 83198cb37d8d..4a053b7aef42 100644 +--- a/lib/kobject.c ++++ b/lib/kobject.c +@@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); + + + static DEFINE_SPINLOCK(kobj_ns_type_lock); +-static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; ++static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) + { + enum kobj_ns_type type = ops->type; + int error; +diff --git a/lib/nlattr.c b/lib/nlattr.c +index cace9b307781..39ba1387045d 100644 +--- a/lib/nlattr.c ++++ b/lib/nlattr.c +@@ -571,6 +571,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) + { + int minlen = min_t(int, count, nla_len(src)); + ++ BUG_ON(minlen < 0); ++ + memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index e78017a3e1bd..ac5a5b5a439b 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -771,7 +771,7 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, + return pointer_string(buf, end, (const void *)hashval, spec); + } + +-int kptr_restrict __read_mostly; ++int kptr_restrict __read_mostly = 2; + + static noinline_for_stack + char *restricted_pointer(char *buf, char *end, const void *ptr, +diff --git a/mm/Kconfig b/mm/Kconfig 
+index a5dae9a7eb51..0a3070c5a125 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -303,7 +303,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 32768 if ARM || (ARM64 && COMPAT) ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages +diff --git a/mm/mmap.c b/mm/mmap.c +index 4390dbea4aa5..076fd46af68c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -230,6 +230,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); ++ /* properly handle unaligned min_brk as an empty heap */ ++ if (min_brk & ~PAGE_MASK) { ++ if (brk == min_brk) ++ newbrk -= PAGE_SIZE; ++ if (mm->brk == min_brk) ++ oldbrk -= PAGE_SIZE; ++ } + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 45e39131a716..78b4865f8a1c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -106,6 +107,15 @@ struct pcpu_drain { + DEFINE_MUTEX(pcpu_drain_mutex); + DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); + ++bool __meminitdata extra_latent_entropy; ++ ++static int __init setup_extra_latent_entropy(char *str) ++{ ++ extra_latent_entropy = true; ++ return 0; ++} ++early_param("extra_latent_entropy", setup_extra_latent_entropy); ++ + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + volatile unsigned long latent_entropy __latent_entropy; + EXPORT_SYMBOL(latent_entropy); +@@ -1427,6 +1437,25 @@ static void __free_pages_ok(struct page *page, unsigned int order) + local_irq_restore(flags); + } + ++static void __init __gather_extra_latent_entropy(struct page *page, ++ unsigned int nr_pages) ++{ ++ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { ++ unsigned long hash = 0; ++ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; ++ const unsigned long *data = lowmem_page_address(page); ++ ++ for (index = 0; index < end; index++) ++ hash ^= hash + data[index]; ++#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY ++ latent_entropy ^= hash; ++ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); ++#else ++ add_device_randomness((const void *)&hash, sizeof(hash)); ++#endif ++ } ++} ++ + void __free_pages_core(struct page *page, unsigned int order) + { + unsigned int nr_pages = 1 << order; +@@ -1441,7 +1470,6 @@ void __free_pages_core(struct page *page, unsigned int order) + } + __ClearPageReserved(p); + set_page_count(p, 0); +- + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + set_page_refcounted(page); + __free_pages(page, order); +@@ -1492,6 +1520,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, + { + if (early_page_uninitialised(pfn)) + return; ++ __gather_extra_latent_entropy(page, 1 << order); + __free_pages_core(page, order); + } + +@@ -1582,6 +1611,7 @@ static void __init deferred_free_range(unsigned long pfn, + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ __gather_extra_latent_entropy(page, 1 << pageblock_order); + __free_pages_core(page, pageblock_order); + return; + } +@@ -1589,6 +1619,7 @@ static void __init deferred_free_range(unsigned long pfn, + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + 
set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ __gather_extra_latent_entropy(page, 1); + __free_pages_core(page, 0); + } + } +@@ -2156,6 +2187,12 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags + { + post_alloc_hook(page, order, gfp_flags); + ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { ++ int i; ++ for (i = 0; i < (1 << order); i++) ++ verify_zero_highpage(page + i); ++ } ++ + if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); + +diff --git a/mm/slab.h b/mm/slab.h +index b2b01694dc43..b531661095a2 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -470,9 +470,13 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) + struct page *page; + + page = virt_to_head_page(obj); ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageSlab(page)); ++#else + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; ++#endif + return page->slab_cache; + } + +@@ -518,9 +522,14 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) + return s; + + cachep = virt_to_cache(x); +- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), +- "%s: Wrong slab cache. %s but object is from %s\n", +- __func__, s->name, cachep->name); ++ if (cachep && !slab_equal_or_root(cachep, s)) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG(); ++#else ++ WARN_ONCE(1, "%s: Wrong slab cache. %s but object is from %s\n", ++ __func__, s->name, cachep->name); ++#endif ++ } + return cachep; + } + +@@ -545,7 +554,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) + * back there or track user information then we can + * only use the space before that information. + */ +- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ++ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation +@@ -674,8 +683,10 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } + static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) + { + if (static_branch_unlikely(&init_on_alloc)) { ++#ifndef CONFIG_SLUB + if (c->ctor) + return false; ++#endif + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; +@@ -685,9 +696,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) + + static inline bool slab_want_init_on_free(struct kmem_cache *c) + { +- if (static_branch_unlikely(&init_on_free)) +- return !(c->ctor || +- (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); ++ if (static_branch_unlikely(&init_on_free)) { ++#ifndef CONFIG_SLUB ++ if (c->ctor) ++ return false; ++#endif ++ if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ++ return false; ++ return true; ++ } + return false; + } + +diff --git a/mm/slab_common.c b/mm/slab_common.c +index ade6c257d4b4..f8f9ebd51296 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -28,10 +28,10 @@ + + #include "slab.h" + +-enum slab_state slab_state; ++enum slab_state slab_state __ro_after_init; + LIST_HEAD(slab_caches); + DEFINE_MUTEX(slab_mutex); +-struct kmem_cache *kmem_cache; ++struct kmem_cache *kmem_cache __ro_after_init; + + #ifdef CONFIG_HARDENED_USERCOPY + bool usercopy_fallback __ro_after_init = +@@ -59,7 +59,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + /* + * Merge control. If this is set then no merging of slab caches will occur. 
+ */ +-static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); ++static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + + static int __init setup_slab_nomerge(char *str) + { +diff --git a/mm/slub.c b/mm/slub.c +index 20d72cb20515..6690bce322a4 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -125,6 +125,12 @@ static inline int kmem_cache_debug(struct kmem_cache *s) + #endif + } + ++static inline bool has_sanitize_verify(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && ++ slab_want_init_on_free(s); ++} ++ + void *fixup_red_left(struct kmem_cache *s, void *p) + { + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) +@@ -309,6 +315,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + } + ++#ifdef CONFIG_SLAB_CANARY ++static inline unsigned long *get_canary(struct kmem_cache *s, void *object) ++{ ++ if (s->offset) ++ return object + s->offset + sizeof(void *); ++ return object + s->inuse; ++} ++ ++static inline unsigned long get_canary_value(const void *canary, unsigned long value) ++{ ++ return (value ^ (unsigned long)canary) & CANARY_MASK; ++} ++ ++static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ *canary = get_canary_value(canary, value); ++} ++ ++static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ BUG_ON(*canary != get_canary_value(canary, value)); ++} ++#else ++#define set_canary(s, object, value) ++#define check_canary(s, object, value) ++#endif ++ + /* Loop over all objects in a slab */ + #define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ +@@ -476,13 +511,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) + * Debug settings: + */ + #if defined(CONFIG_SLUB_DEBUG_ON) +-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; ++static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; + #else +-static slab_flags_t slub_debug; ++static slab_flags_t slub_debug __ro_after_init; + #endif + +-static char *slub_debug_slabs; +-static int disable_higher_order_debug; ++static char *slub_debug_slabs __ro_after_init; ++static int disable_higher_order_debug __ro_after_init; + + /* + * slub is about to manipulate internal object metadata. This memory lies +@@ -543,6 +578,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, + else + p = object + s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ p = (void *)p + sizeof(void *); ++ + return p + alloc; + } + +@@ -673,6 +711,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + else + off = s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + +@@ -802,6 +843,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) + /* Freepointer is placed after the object. 
*/ + off += sizeof(void *); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); +@@ -1441,6 +1485,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + object = next; + next = get_freepointer(s, object); + ++ check_canary(s, object, s->random_active); ++ + if (slab_want_init_on_free(s)) { + /* + * Clear the object and the metadata, but don't touch +@@ -1451,8 +1497,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + : 0; + memset((char *)object + s->inuse, 0, + s->size - s->inuse - rsize); +- ++ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) ++ s->ctor(object); + } ++ ++ set_canary(s, object, s->random_inactive); ++ + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ +@@ -1460,6 +1510,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + *head = object; + if (!*tail) + *tail = object; ++ } else if (slab_want_init_on_free(s) && s->ctor) { ++ /* Objects that are put into quarantine by KASAN will ++ * still undergo free_consistency_checks() and thus ++ * need to show a valid freepointer to check_object(). ++ * ++ * Note that doing this for all caches (not just ctor ++ * ones, which have s->offset != NULL)) causes a GPF, ++ * due to KASAN poisoning and the way set_freepointer() ++ * eventually dereferences the freepointer. ++ */ ++ set_freepointer(s, object, NULL); + } + } while (object != old_tail); + +@@ -1473,8 +1534,9 @@ static void *setup_object(struct kmem_cache *s, struct page *page, + void *object) + { + setup_object_debug(s, page, object); ++ set_canary(s, object, s->random_inactive); + object = kasan_init_slab_obj(s, object); +- if (unlikely(s->ctor)) { ++ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); +@@ -2752,8 +2814,28 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + + maybe_wipe_obj_freeptr(s, object); + +- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) ++ if (has_sanitize_verify(s) && object) { ++ /* KASAN hasn't unpoisoned the object yet (this is done in the ++ * post-alloc hook), so let's do it temporarily. 
++ */ ++ kasan_unpoison_object_data(s, object); ++ BUG_ON(memchr_inv(object, 0, s->object_size)); ++ if (s->ctor) ++ s->ctor(object); ++ kasan_poison_object_data(s, object); ++ } else if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) { + memset(object, 0, s->object_size); ++ if (s->ctor) { ++ kasan_unpoison_object_data(s, object); ++ s->ctor(object); ++ kasan_poison_object_data(s, object); ++ } ++ } ++ ++ if (object) { ++ check_canary(s, object, s->random_inactive); ++ set_canary(s, object, s->random_active); ++ } + + slab_post_alloc_hook(s, gfpflags, 1, &object); + +@@ -3136,7 +3218,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; +- int i; ++ int i, k; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); +@@ -3176,11 +3258,35 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ +- if (unlikely(slab_want_init_on_alloc(flags, s))) { ++ if (has_sanitize_verify(s)) { + int j; + +- for (j = 0; j < i; j++) ++ for (j = 0; j < i; j++) { ++ /* KASAN hasn't unpoisoned the object yet (this is done ++ * in the post-alloc hook), so let's do it temporarily. ++ */ ++ kasan_unpoison_object_data(s, p[j]); ++ BUG_ON(memchr_inv(p[j], 0, s->object_size)); ++ if (s->ctor) ++ s->ctor(p[j]); ++ kasan_poison_object_data(s, p[j]); ++ } ++ } else if (unlikely(slab_want_init_on_alloc(flags, s))) { ++ int j; ++ ++ for (j = 0; j < i; j++) { + memset(p[j], 0, s->object_size); ++ if (s->ctor) { ++ kasan_unpoison_object_data(s, p[j]); ++ s->ctor(p[j]); ++ kasan_poison_object_data(s, p[j]); ++ } ++ } ++ } ++ ++ for (k = 0; k < i; k++) { ++ check_canary(s, p[k], s->random_inactive); ++ set_canary(s, p[k], s->random_active); + } + + /* memcg and kmem_cache debug support */ +@@ -3214,9 +3320,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * and increases the number of allocations possible without having to + * take the list_lock. + */ +-static unsigned int slub_min_order; +-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +-static unsigned int slub_min_objects; ++static unsigned int slub_min_order __ro_after_init; ++static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; ++static unsigned int slub_min_objects __ro_after_init; + + /* + * Calculate the order of allocation given an slab object size. 
+@@ -3384,6 +3490,7 @@ static void early_kmem_cache_node_alloc(int node) + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); + #endif ++ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); + n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); + page->freelist = get_freepointer(kmem_cache_node, n); +@@ -3544,6 +3651,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) + size += sizeof(void *); + } + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ size += sizeof(void *); ++ + #ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* +@@ -3616,6 +3726,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) + #ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); + #endif ++#ifdef CONFIG_SLAB_CANARY ++ s->random_active = get_random_long(); ++ s->random_inactive = get_random_long(); ++#endif + + if (!calculate_sizes(s, -1)) + goto error; +@@ -3891,6 +4005,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, + offset -= s->red_left_pad; + } + ++ check_canary(s, (void *)ptr - offset, s->random_active); ++ + /* Allow address range falling entirely within usercopy region. */ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && +@@ -3924,7 +4040,11 @@ size_t __ksize(const void *object) + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageCompound(page)); ++#else + WARN_ON(!PageCompound(page)); ++#endif + return page_size(page); + } + +@@ -4769,7 +4889,7 @@ enum slab_stat_type { + #define SO_TOTAL (1 << SL_TOTAL) + + #ifdef CONFIG_MEMCG +-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); ++static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + + static int __init setup_slub_memcg_sysfs(char *str) + { +diff --git a/mm/swap.c b/mm/swap.c +index 38c3fa4308e2..0534c2e348c2 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -94,6 +94,13 @@ static void __put_compound_page(struct page *page) + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); ++ if (!PageHuge(page)) ++ BUG_ON(dtor != free_compound_page ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ && dtor != free_transhuge_page ++#endif ++ ); ++ + (*dtor)(page); + } + +diff --git a/mm/util.c b/mm/util.c +index 3ad6db9a722e..80209685f67c 100644 +--- a/mm/util.c ++++ b/mm/util.c +@@ -325,9 +325,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) + { + /* Is the current task 32bit ? 
*/ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) +- return randomize_page(mm->brk, SZ_32M); ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + +- return randomize_page(mm->brk, SZ_1G); ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + unsigned long arch_mmap_rnd(void) +diff --git a/net/core/dev.c b/net/core/dev.c +index 3098c90d60e2..08de516adfd5 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4492,7 +4492,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static __latent_entropy void net_tx_action(struct softirq_action *h) ++static __latent_entropy void net_tx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + +@@ -6353,7 +6353,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) + return work; + } + +-static __latent_entropy void net_rx_action(struct softirq_action *h) ++static __latent_entropy void net_rx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 03381f3e12ba..8ea409f37436 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -267,6 +267,7 @@ config IP_PIMSM_V2 + + config SYN_COOKIES + bool "IP: TCP syncookie support" ++ default y + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote +diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost +index 952fff485546..59ffccdb1be4 100644 +--- a/scripts/Makefile.modpost ++++ b/scripts/Makefile.modpost +@@ -54,6 +54,7 @@ MODPOST = scripts/mod/modpost \ + $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ + $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ + $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ ++ $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ + $(if $(KBUILD_MODPOST_WARN),-w) \ + $(if $(filter nsdeps,$(MAKECMDGOALS)),-d) + +diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig +index e3569543bdac..55cc439b3bc6 100644 +--- a/scripts/gcc-plugins/Kconfig ++++ b/scripts/gcc-plugins/Kconfig +@@ -61,6 +61,11 @@ config GCC_PLUGIN_LATENT_ENTROPY + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + ++ When extra_latent_entropy is passed on the kernel command line, ++ entropy will be extracted from up to the first 4GB of RAM while the ++ runtime memory allocator is being initialized. This costs even more ++ slowdown of the boot process. ++ + Note that entropy extracted this way is not cryptographically + secure! 
+ +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index d2a30a7b3f07..ff57a5fe8029 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -36,6 +36,8 @@ static int warn_unresolved = 0; + /* How a symbol is exported */ + static int sec_mismatch_count = 0; + static int sec_mismatch_fatal = 0; ++static int writable_fptr_count = 0; ++static int writable_fptr_verbose = 0; + /* ignore missing files */ + static int ignore_missing_files; + /* write namespace dependencies */ +@@ -1019,6 +1021,7 @@ enum mismatch { + ANY_EXIT_TO_ANY_INIT, + EXPORT_TO_INIT_EXIT, + EXTABLE_TO_NON_TEXT, ++ DATA_TO_TEXT + }; + + /** +@@ -1145,6 +1148,12 @@ static const struct sectioncheck sectioncheck[] = { + .good_tosec = {ALL_TEXT_SECTIONS , NULL}, + .mismatch = EXTABLE_TO_NON_TEXT, + .handler = extable_mismatch_handler, ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } + }; + +@@ -1332,10 +1341,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, + continue; + if (!is_valid_name(elf, sym)) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1470,7 +1479,13 @@ static void report_sec_mismatch(const char *modname, + char *prl_from; + char *prl_to; + +- sec_mismatch_count++; ++ if (mismatch->mismatch == DATA_TO_TEXT) { ++ writable_fptr_count++; ++ if (!writable_fptr_verbose) ++ return; ++ } else { ++ sec_mismatch_count++; ++ } + + get_pretty_name(from_is_func, &from, &from_p); + get_pretty_name(to_is_func, &to, &to_p); +@@ -1592,6 +1607,12 @@ static void report_sec_mismatch(const char *modname, + fatal("There's a special handler for this mismatch type, " + "we should never get here."); + break; ++ case DATA_TO_TEXT: ++ fprintf(stderr, ++ "The %s %s:%s references\n" ++ "the %s %s:%s%s\n", ++ from, fromsec, fromsym, to, tosec, tosym, to_p); ++ break; + } + fprintf(stderr, "\n"); + } +@@ -2569,7 +2590,7 @@ int main(int argc, char **argv) + struct ext_sym_list *extsym_iter; + struct ext_sym_list *extsym_start = NULL; + +- while ((opt = getopt(argc, argv, "i:I:e:mnsT:o:awEd")) != -1) { ++ while ((opt = getopt(argc, argv, "i:I:e:fmnsT:o:awEd")) != -1) { + switch (opt) { + case 'i': + kernel_read = optarg; +@@ -2586,6 +2607,9 @@ int main(int argc, char **argv) + extsym_iter->file = optarg; + extsym_start = extsym_iter; + break; ++ case 'f': ++ writable_fptr_verbose = 1; ++ break; + case 'm': + modversions = 1; + break; +@@ -2692,6 +2716,11 @@ int main(int argc, char **argv) + } + + free(buf.p); ++ if (writable_fptr_count && !writable_fptr_verbose) ++ warn("modpost: Found %d writable function pointer%s.\n" ++ "To see full details build your kernel with:\n" ++ "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", ++ writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); + + return err; + } +diff --git a/security/Kconfig b/security/Kconfig +index 2a1a2d396228..3b7a71410f88 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -9,7 +9,7 @@ source "security/keys/Kconfig" + + config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" +- default n ++ default y + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). 
+@@ -19,10 +19,34 @@ config SECURITY_DMESG_RESTRICT + + If you are unsure how to answer this question, answer N. + ++config SECURITY_PERF_EVENTS_RESTRICT ++ bool "Restrict unprivileged use of performance events" ++ depends on PERF_EVENTS ++ default y ++ help ++ If you say Y here, the kernel.perf_event_paranoid sysctl ++ will be set to 3 by default, and no unprivileged use of the ++ perf_event_open syscall will be permitted unless it is ++ changed. ++ ++config SECURITY_TIOCSTI_RESTRICT ++ bool "Restrict unprivileged use of tiocsti command injection" ++ default y ++ help ++ This enforces restrictions on unprivileged users injecting commands ++ into other processes which share a tty session using the TIOCSTI ++ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. ++ ++ If this option is not selected, no restrictions will be enforced ++ unless the tiocsti_restrict sysctl is explicitly set to (1). ++ ++ If you are unsure how to answer this question, answer N. ++ + config SECURITY + bool "Enable different security models" + depends on SYSFS + depends on MULTIUSER ++ default y + help + This allows you to choose different security modules to be + configured into your kernel. +@@ -48,6 +72,7 @@ config SECURITYFS + config SECURITY_NETWORK + bool "Socket and Networking Security Hooks" + depends on SECURITY ++ default y + help + This enables the socket and networking security hooks. + If enabled, a security module can use these hooks to +@@ -154,6 +179,7 @@ config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR + imply STRICT_DEVMEM ++ default y + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and +@@ -166,7 +192,6 @@ config HARDENED_USERCOPY + config HARDENED_USERCOPY_FALLBACK + bool "Allow usercopy whitelist violations to fallback to object size" + depends on HARDENED_USERCOPY +- default y + help + This is a temporary option that allows missing usercopy whitelists + to be discovered via a WARN() to the kernel log, instead of +@@ -191,10 +216,21 @@ config HARDENED_USERCOPY_PAGESPAN + config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE ++ default y + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + ++config FORTIFY_SOURCE_STRICT_STRING ++ bool "Harden common functions against buffer overflows" ++ depends on FORTIFY_SOURCE ++ depends on EXPERT ++ help ++ Perform stricter overflow checks catching overflows within objects ++ for common C string functions rather than only between objects. ++ ++ This is not yet intended for production use, only bug finding. ++ + config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help +diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening +index af4c979b38ee..473e40bb8537 100644 +--- a/security/Kconfig.hardening ++++ b/security/Kconfig.hardening +@@ -169,6 +169,7 @@ config STACKLEAK_RUNTIME_DISABLE + + config INIT_ON_ALLOC_DEFAULT_ON + bool "Enable heap memory zeroing on allocation by default" ++ default yes + help + This has the effect of setting "init_on_alloc=1" on the kernel + command line. This can be disabled with "init_on_alloc=0". 
+@@ -181,6 +182,7 @@ config INIT_ON_ALLOC_DEFAULT_ON + + config INIT_ON_FREE_DEFAULT_ON + bool "Enable heap memory zeroing on free by default" ++ default yes + help + This has the effect of setting "init_on_free=1" on the kernel + command line. This can be disabled with "init_on_free=0". +@@ -196,6 +198,20 @@ config INIT_ON_FREE_DEFAULT_ON + touching "cold" memory areas. Most cases see 3-5% impact. Some + synthetic workloads have measured as high as 8%. + ++config PAGE_SANITIZE_VERIFY ++ bool "Verify sanitized pages" ++ default y ++ help ++ When init_on_free is enabled, verify that newly allocated pages ++ are zeroed to detect write-after-free bugs. ++ ++config SLAB_SANITIZE_VERIFY ++ default y ++ bool "Verify sanitized SLAB allocations" ++ help ++ When init_on_free is enabled, verify that newly allocated slab ++ objects are zeroed to detect write-after-free bugs. ++ + endmenu + + endmenu +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 5711689deb6a..fab0cb896907 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -3,7 +3,7 @@ config SECURITY_SELINUX + bool "NSA SELinux Support" + depends on SECURITY_NETWORK && AUDIT && NET && INET + select NETWORK_SECMARK +- default n ++ default y + help + This selects NSA Security-Enhanced Linux (SELinux). + You will also need a policy configuration and a labeled filesystem. +@@ -65,23 +65,3 @@ config SECURITY_SELINUX_AVC_STATS + This option collects access vector cache statistics to + /selinux/avc/cache_stats, which may be monitored via + tools such as avcstat. +- +-config SECURITY_SELINUX_CHECKREQPROT_VALUE +- int "NSA SELinux checkreqprot default value" +- depends on SECURITY_SELINUX +- range 0 1 +- default 0 +- help +- This option sets the default value for the 'checkreqprot' flag +- that determines whether SELinux checks the protection requested +- by the application or the protection that will be applied by the +- kernel (including any implied execute for read-implies-exec) for +- mmap and mprotect calls. If this option is set to 0 (zero), +- SELinux will default to checking the protection that will be applied +- by the kernel. If this option is set to 1 (one), SELinux will +- default to checking the protection requested by the application. +- The checkreqprot flag may be changed from the default via the +- 'checkreqprot=' boot parameter. It may also be changed at runtime +- via /selinux/checkreqprot if authorized by policy. +- +- If you are unsure how to answer this question, answer 0. +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 9625b99e677f..daa40da7a8f9 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -135,18 +135,7 @@ static int __init selinux_enabled_setup(char *str) + __setup("selinux=", selinux_enabled_setup); + #endif + +-static unsigned int selinux_checkreqprot_boot = +- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; +- +-static int __init checkreqprot_setup(char *str) +-{ +- unsigned long checkreqprot; +- +- if (!kstrtoul(str, 0, &checkreqprot)) +- selinux_checkreqprot_boot = checkreqprot ? 
1 : 0; +- return 1; +-} +-__setup("checkreqprot=", checkreqprot_setup); ++static const unsigned int selinux_checkreqprot_boot; + + /** + * selinux_secmark_enabled - Check to see if SECMARK is currently enabled +diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c +index e6c7643c3fc0..0e8217f72c5a 100644 +--- a/security/selinux/selinuxfs.c ++++ b/security/selinux/selinuxfs.c +@@ -639,7 +639,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, + static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; + char *page; + ssize_t length; + unsigned int new_value; +@@ -663,10 +662,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + return PTR_ERR(page); + + length = -EINVAL; +- if (sscanf(page, "%u", &new_value) != 1) ++ if (sscanf(page, "%u", &new_value) != 1 || new_value) + goto out; + +- fsi->state->checkreqprot = new_value ? 1 : 0; + length = count; + out: + kfree(page); +diff --git a/security/yama/Kconfig b/security/yama/Kconfig +index a810304123ca..b809050b25d2 100644 +--- a/security/yama/Kconfig ++++ b/security/yama/Kconfig +@@ -2,7 +2,7 @@ + config SECURITY_YAMA + bool "Yama support" + depends on SECURITY +- default n ++ default y + help + This selects Yama, which extends DAC support with additional + system-wide security settings beyond regular Linux discretionary diff --git a/linux55-tkg/PKGBUILD b/linux55-tkg/PKGBUILD new file mode 100644 index 0000000..bc399c1 --- /dev/null +++ b/linux55-tkg/PKGBUILD @@ -0,0 +1,1115 @@ +# Based on the file created for Arch Linux by: +# Tobias Powalowski +# Thomas Baechler + +# Contributor: Tk-Glitch + +plain ' .---.` `.---.' +plain ' `/syhhhyso- -osyhhhys/`' +plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' +plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' +plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' +plain ' /ssyhhhysssssssssssssssssyhhhyss/' +plain ' .ossssssssssssssssssssssssssssso.' +plain ' :sssssssssssssssssssssssssssssssss:' +plain ' /sssssssssssssssssssssssssssssssssss/' +plain ' :sssssssssssssoosssssssoosssssssssssss:' +plain ' osssssssssssssoosssssssoossssssssssssso' +plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' +plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' +plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' +plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' +plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' +plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' +plain ' `.-:///////:-.`' + +_where=$PWD # track basedir as different Arch based distros are moving srcdir around + +cp "$_where"/linux55-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking +cp "$_where"/linux55-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking + +source "$_where"/customization.cfg # load default configuration from file + +# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. +if [ -e "$_EXT_CONFIG_PATH" ]; then + source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" +fi + +if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then + # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. 
+ plain "Do you want to use a predefined optimized profile?" + read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; +fi +if [ "$_OPTIPROFILE" == "2" ]; then + source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" +elif [ "$_OPTIPROFILE" == "3" ]; then + source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" +fi + +# source cpuschedset early if present +if [ -e "$_where"/cpuschedset ]; then + source "$_where"/cpuschedset +fi + +# CPU SCHED selector +if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then + plain "What CPU sched variant do you want to build/install?" + read -rp "`echo $' > 1.PDS\n 2.MuQSS\n 3.BMQ\n 4.CFS\nchoice[1-4?]: '`" CONDITION; + if [ "$CONDITION" == "2" ]; then + echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset + elif [ "$CONDITION" == "3" ]; then + echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset + elif [ "$CONDITION" == "4" ]; then + echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset + else + echo "_cpusched=\"pds\"" > "$_where"/cpuschedset + fi + if [ -n "$_custom_pkgbase" ]; then + echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset + fi +elif [ "$_cpusched" == "muqss" ] || [ "$_cpusched" == "MuQSS" ]; then + echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset +elif [ "$_cpusched" == "pds" ]; then + echo "_cpusched=\"pds\"" > "$_where"/cpuschedset +elif [ "$_cpusched" == "bmq" ]; then + echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset +else + echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset +fi + +source "$_where"/cpuschedset + +_basever=55 +if [ -n "$_custom_pkgbase" ]; then + pkgbase="${_custom_pkgbase}" +else + pkgbase=linux"${_basever}"-tkg-"${_cpusched}" +fi +pkgname=("${pkgbase}" "${pkgbase}-headers") +_basekernel=5.5 +_sub=13 +pkgver="${_basekernel}"."${_sub}" +pkgrel=28 +pkgdesc='Linux-tkg' +arch=('x86_64') # no i686 in here +url="http://www.kernel.org/" +license=('GPL2') +makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') +optdepends=('schedtool') +if pacman -Qq ccache &> /dev/null; then + msg2 'ccache was found and will be used' + options=('!strip' 'ccache') +else + msg2 'ccache was not found and will not be used' + options=('!strip') +fi +source=("https://www.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" + "https://www.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" + 'config.x86_64' # stock Arch config + #'config_hardened.x86_64' # hardened Arch config + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + 0004-glitched-ondemand-muqss.patch + 0004-glitched-muqss.patch + 0004-5.5-ck1.patch + 0005-glitched-ondemand-pds.patch + 0005-glitched-pds.patch + 0005-v5.5_undead-pds099o.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.5-fsync.patch + #0008-5.5-bcachefs.patch + 0009-glitched-ondemand-bmq.patch + 0009-glitched-bmq.patch + 0009-bmq_v5.5-r3.patch + 0011-ZFS-fix.patch + #0012-linux-hardened.patch +) +sha256sums=('a6fbd4ee903c128367892c2393ee0d9657b6ed3ea90016d4dc6f1f6da20b2330' + 'a58dad931dda6eba7656551da73d1c452317617c8282c094fa4f646d9422993a' + 'a841aa011edf6bae0ffbe8ead8177e5056de5a6d7333bb96e16917903de4d868' + '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' + 
'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' + '8fa175556f5339bae1f2e4f30def3ed3b8568fbec54f6ac670f54522ffd41145' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' + 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + 'ded9d281ac20def48de01f1a2f69c1c4c15ac05feca7cd30e5c819e376d721e9' + '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' + '7fd8e776209dac98627453fda754bdf9aff4a09f27cb0b3766d7983612eb3c74' + '5255ec9387d85a1e1f50b5d7f6ac9983111b31f6e437153b7fab49bb31fdf000' + '90917e09bb06fbed6853efe9e52f8c2ba4066fca44accdf7608222212561104a' + '2d9260b80b43bbd605cf420d6bd53aa7262103dfd77196ba590ece5600b6dc0d' + 'e27ad5ff23a81b5be73a642db5186b447f336956a427d1300e8ccc49abf0dd74' + '965a517a283f265a012545fbb5cc9e516efc9f6166d2aa1baf7293a32a1086b7' + '1a647aa24074af0cc3a0ecbf8c720ab496be9fe1a62fab41c524eb6cc5dcccda' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104') + +export KBUILD_BUILD_HOST=archlinux +export KBUILD_BUILD_USER=$pkgbase +export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" + +user_patcher() { + # To patch the user because all your base are belong to us + local _patches=("$_where"/*."${_userpatch_ext}revert") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [ "$_CONDITION" == "y" ] || [ "$_user_patches_no_confirm" == "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Reverting your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 -R < "${_f}" + echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi + + _patches=("$_where"/*."${_userpatch_ext}patch") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [ "$_CONDITION" == "y" ] || [ "$_user_patches_no_confirm" == "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Applying your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 < "${_f}" + echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi +} + +prepare() { + rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build + + ln -s "${_where}/customization.cfg" "${srcdir}" # workaround + + cd "${srcdir}/linux-${_basekernel}" + + msg2 "Setting version..." 
+ scripts/setlocalversion --save-scmversion + echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel + echo "" > localversion.20-pkgname + + # add upstream patch + patch -p1 -i ../patch-"${pkgver}" + + # ARCH Patches + if [ "${_configfile}" == "config_hardened.x86_64" ] && [ "${_cpusched}" == "cfs" ]; then + msg2 "Using linux hardened patchset" + patch -Np1 -i ../0012-linux-hardened.patch + else + patch -Np1 -i ../0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + fi + + # TkG + patch -Np1 -i ../0002-clear-patches.patch + + patch -Np1 -i ../0003-glitched-base.patch + + if [ "${_cpusched}" == "MuQSS" ]; then + # MuQSS + patch -Np1 -i ../0004-5.5-ck1.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0004-glitched-ondemand-muqss.patch + fi + patch -Np1 -i ../0004-glitched-muqss.patch + elif [ "${_cpusched}" == "pds" ]; then + # PDS-mq + patch -Np1 -i ../0005-v5.5_undead-pds099o.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0005-glitched-ondemand-pds.patch + fi + patch -Np1 -i ../0005-glitched-pds.patch + elif [ "${_cpusched}" == "bmq" ]; then + # BMQ + patch -Np1 -i ../0009-bmq_v5.5-r3.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0009-glitched-ondemand-bmq.patch + fi + patch -Np1 -i ../0009-glitched-bmq.patch + else + patch -Np1 -i ../0003-glitched-cfs.patch + fi + + if [ -z "${_configfile}" ]; then + _configfile="config.x86_64" + fi + + cat "${srcdir}/${_configfile}" > ./.config + + # Set some -tkg defaults + echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config + sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' ./.config + echo "CONFIG_DEFAULT_CAKE=y" >> ./.config + echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config + echo "CONFIG_TP_SMAPI=m" >> ./.config + echo "CONFIG_RAID6_USE_PREFER_GEN=y" >> ./.config + echo "# CONFIG_NTP_PPS is not set" >> ./.config + sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config + sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config + sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config + sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config + #sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config + sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config + echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config + echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config + + if [ "$_font_autoselect" != "false" ]; then + sed -i -e 's/CONFIG_FONT_TER16x32=y/# CONFIG_FONT_TER16x32 is not set\nCONFIG_FONT_AUTOSELECT=y/' ./.config + fi + + # Inject cpuopts options + echo "# CONFIG_MK8SSE3 is not set" >> ./.config + echo "# CONFIG_MK10 is not set" >> ./.config + echo "# CONFIG_MBARCELONA is not set" >> ./.config + echo "# CONFIG_MBOBCAT is not set" >> ./.config + echo "# CONFIG_MJAGUAR is not set" >> ./.config + echo "# CONFIG_MBULLDOZER is not set" >> ./.config + echo "# CONFIG_MPILEDRIVER is not set" >> ./.config + echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config + echo "# CONFIG_MEXCAVATOR is not set" >> ./.config + echo "# CONFIG_MZEN is not set" >> ./.config + echo "# CONFIG_MZEN2 is not set" >> ./.config + echo "# CONFIG_MATOM is not set" >> ./.config + echo "# CONFIG_MNEHALEM is not set" >> ./.config + echo "# CONFIG_MWESTMERE is not set" >> ./.config + echo "# CONFIG_MSILVERMONT is not set" >> ./.config + echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config + echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config + echo 
"# CONFIG_MHASWELL is not set" >> ./.config + echo "# CONFIG_MBROADWELL is not set" >> ./.config + echo "# CONFIG_MSKYLAKE is not set" >> ./.config + echo "# CONFIG_MSKYLAKEX is not set" >> ./.config + echo "# CONFIG_MCANNONLAKE is not set" >> ./.config + echo "# CONFIG_MICELAKE is not set" >> ./.config + echo "# CONFIG_MGOLDMONT is not set" >> ./.config + echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config + echo "# CONFIG_MCASCADELAKE is not set" >> ./.config + + # Disable some debugging + if [ "${_debugdisable}" == "true" ]; then + sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config + fi + + if [ "${_cpusched}" == "MuQSS" ]; then + # MuQSS default config + echo "CONFIG_SCHED_MUQSS=y" >> ./.config + elif [ "${_cpusched}" == "pds" ]; then + # PDS default config + echo "CONFIG_SCHED_PDS=y" >> ./.config + elif [ "${_cpusched}" == "bmq" ]; then + # BMQ default config + echo "CONFIG_SCHED_BMQ=y" >> ./.config + fi + + if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then + # Disable CFS + sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config + sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config + # sched yield type + if [ -n "$_sched_yield_type" ]; then + CONDITION0="$_sched_yield_type" + else + plain "" + plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." + plain "" + plain "For PDS and MuQSS:" + plain "0: No yield." + plain "1: Yield only to better priority/deadline tasks." + plain "2: Expire timeslice and recalculate deadline." + plain "" + plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" + plain "0: No yield." + plain "1: Deboost and requeue task. (default)" + plain "2: Set rq skip task." + read -rp "`echo $'\n > 0. Recommended option for gaming on PDS and MuQSS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; + fi + if [ "$CONDITION0" == "1" ]; then + msg2 "Using default CPU sched yield type (1)" + elif [ "$CONDITION0" == "2" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c + else + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c + fi + fi + + # Round Robin interval + if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then + if [ -n "$_rr_interval" ]; then + CONDITION1="$_rr_interval" + else + plain "" + plain "Round Robin interval is the longest duration two tasks with the same nice level will" + plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" + plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" + plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
+ plain "" + plain "MuQSS default: 6ms" + plain "PDS default: 4ms" + plain "BMQ default: 2ms" + read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; + fi + if [ "$CONDITION1" == "1" ]; then + msg2 "Using 2ms rr_interval" + _rrvalue="2" + elif [ "$CONDITION1" == "2" ]; then + msg2 "Using 4ms rr_interval" + _rrvalue="4" + elif [ "$CONDITION1" == "3" ]; then + msg2 "Using 6ms rr_interval" + _rrvalue="6" + elif [ "$CONDITION1" == "4" ]; then + msg2 "Using 8ms rr_interval" + _rrvalue="8" + else + msg2 "Using default rr_interval" + _rrvalue="default" + fi + if [ "$_rrvalue" != "default" ]; then + if [ "${_cpusched}" == "MuQSS" ]; then + sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "pds" ]; then + sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "bmq" ]; then + sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/"${_cpusched}".c + fi + else + if [ "${_cpusched}" == "bmq" ]; then + sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/"${_cpusched}".c + fi + fi + fi + + # zenify + if [ "$_zenify" == "true" ]; then + echo "CONFIG_ZENIFY=y" >> ./.config + elif [ "$_zenify" == "false" ]; then + echo "# CONFIG_ZENIFY is not set" >> ./.config + fi + + # compiler optimization level + if [ "$_compileroptlevel" == "1" ]; then + echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config + elif [ "$_compileroptlevel" == "2" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config + elif [ "$_compileroptlevel" == "3" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config + echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config + fi + + # cpu opt + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then + echo "# CONFIG_MNATIVE is not set" >> ./.config + fi + + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then + sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config + fi + + if [ "$_processor_opt" == "native" ]; then + echo "CONFIG_MNATIVE=y" >> ./.config + elif [ "$_processor_opt" == "k8" ]; then + sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config + elif [ "$_processor_opt" == "k8sse3" ]; then + sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config + elif [ "$_processor_opt" == "k10" ]; then + sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config + elif [ "$_processor_opt" == "barcelona" ]; then + sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config + elif [ "$_processor_opt" == "bobcat" ]; then + sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config + elif [ "$_processor_opt" == "jaguar" ]; then + sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config + elif [ "$_processor_opt" == "bulldozer" ]; then + sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config + elif [ "$_processor_opt" == 
"piledriver" ]; then + sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config + elif [ "$_processor_opt" == "steamroller" ]; then + sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config + elif [ "$_processor_opt" == "excavator" ]; then + sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config + elif [ "$_processor_opt" == "zen" ]; then + sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config + elif [ "$_processor_opt" == "zen2" ]; then + sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config + elif [ "$_processor_opt" == "mpsc" ]; then + sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config + elif [ "$_processor_opt" == "atom" ]; then + sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config + elif [ "$_processor_opt" == "core2" ]; then + sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config + elif [ "$_processor_opt" == "nehalem" ]; then + sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config + elif [ "$_processor_opt" == "westmere" ]; then + sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config + elif [ "$_processor_opt" == "silvermont" ]; then + sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config + elif [ "$_processor_opt" == "sandybridge" ]; then + sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config + elif [ "$_processor_opt" == "ivybridge" ]; then + sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config + elif [ "$_processor_opt" == "haswell" ]; then + sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config + elif [ "$_processor_opt" == "broadwell" ]; then + sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config + elif [ "$_processor_opt" == "skylake" ]; then + sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config + elif [ "$_processor_opt" == "skylakex" ]; then + sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config + elif [ "$_processor_opt" == "cannonlake" ]; then + sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config + elif [ "$_processor_opt" == "icelake" ]; then + sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config + elif [ "$_processor_opt" == "goldmont" ]; then + sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config + elif [ "$_processor_opt" == "goldmontplus" ]; then + sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config + elif [ "$_processor_opt" == "cascadelake" ]; then + sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config + fi + + # irq threading + if [ "$_irq_threading" == "true" ]; then + echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config + elif [ "$_irq_threading" == "false" ]; then + echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config + fi + + # smt nice + if [ "$_smt_nice" == "true" ]; then + echo "CONFIG_SMT_NICE=y" >> ./.config + elif [ "$_smt_nice" == "false" ]; then + echo "# CONFIG_SMT_NICE is not set" >> ./.config + fi + + # random trust cpu + if [ "$_random_trust_cpu" == "true" ]; then + sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config + fi + + # rq sharing + if [ "$_runqueue_sharing" == "none" ]; then + echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ 
-z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" == "smt" ]; then + echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "mc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "smp" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "all" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config + elif [ "$_runqueue_sharing" == "mc-llc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + fi + + # timer freq + if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + if [ "$_timer_freq" == "1000" ]; then + sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "CONFIG_HZ_1000_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "750" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "CONFIG_HZ_750=y" >> ./.config + echo "CONFIG_HZ_750_NODEF=y" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "500" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "100" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + echo "CONFIG_HZ_100=y" >> ./.config + echo "CONFIG_HZ_100_NODEF=y" >> ./.config + fi + elif [ "${_cpusched}" == "MuQSS" ] && [ -z "$_timer_freq" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config + echo "# CONFIG_HZ_500 is not 
set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + echo "CONFIG_HZ_100=y" >> ./.config + echo "CONFIG_HZ_100_NODEF=y" >> ./.config + else + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + fi + + # default cpu gov + if [ "$_default_cpu_gov" == "performance" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config + elif [ "$_default_cpu_gov" == "ondemand" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config + fi + + # ACPI_CPUFREQ disablement + if [ "$_disable_acpi_cpufreq" == "true" ]; then + sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config + fi + + # ftrace + if [ -z "$_ftracedisable" ]; then + plain "" + plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" + plain "and analyzing of kernel functions." + read -rp "`echo $' > N/y : '`" CONDITION2; + fi + if [ "$CONDITION2" == "y" ] || [ "$_ftracedisable" == "true" ]; then + sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config + sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config + fi + + # disable numa + if [ -z "$_numadisable" ]; then + plain "" + plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." + plain "https://bbs.archlinux.org/viewtopic.php?id=239174" + read -rp "`echo $' > N/y : '`" CONDITION3; + fi + if [ "$CONDITION3" == "y" ] || [ "$_numadisable" == "true" ]; then + # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU + sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ + -i -e '/CONFIG_AMD_NUMA=y/d' \ + -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ + -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ + -i -e '/# CONFIG_NUMA_EMU is not set/d' \ + -i -e '/CONFIG_NODES_SHIFT=6/d' \ + -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ + -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ + -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config + fi + + # tickless + if [ -z "$_tickless" ]; then + plain "" + plain "Use CattaRappa mode (Tickless/Dynticks) ?" + plain "Can give higher performances in many cases but lower consistency on some hardware." + plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." 
+ if [ "${_cpusched}" == "MuQSS" ]; then + read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + else + read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + fi + fi + if [ "$CONDITION4" == "0" ] || [ "$_tickless" == "0" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config + elif [ "$CONDITION4" == "2" ] || [ "$_tickless" == "2" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + if [ "${_cpusched}" == "MuQSS" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config + echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config + fi + fi + + # voluntary preempt + if [ -z "$_voluntary_preempt" ]; then + plain "" + plain "Use explicit preemption points?" + plain "It can improve latency on PDS (at the cost of throughput)" + plain "and improve throughput on other schedulers (at the cost of latency)" + read -rp "`echo $' > N/y : '`" CONDITION5; + fi + if [ "$CONDITION5" == "y" ] || [ "$_voluntary_preempt" == "true" ]; then + sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config + sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config + sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config + fi + + # Open Firmware support + if [ -z "$_OFenable" ]; then + plain "" + plain "Enable Device Tree and Open Firmware support?" + read -rp "`echo $' > N/y : '`" CONDITION6; + fi + if [ "$CONDITION6" == "y" ] || [ "$_OFenable" == "true" ]; then + sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config + fi + + # acs override + if [ -z "$_acs_override" ]; then + plain "" + plain "Use ACS override patch?" 
+ plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" + read -rp "`echo $' > N/y : '`" CONDITION7; + fi + if [ "$CONDITION7" == "y" ] || [ "$_acs_override" == "true" ]; then + patch -Np1 -i ../0006-add-acs-overrides_iommu.patch + fi + + # bcachefs +# if [ -z "$_bcachefs" ]; then +# plain "" +# plain "Add Bcache filesystem support? You'll have to install bcachefs-tools-git from AUR for utilities." +# plain "https://bcachefs.org/" +# read -rp "`echo $' > N/y : '`" CONDITION8; +# fi +# if [ "$CONDITION8" == "y" ] || [ "$_bcachefs" == "true" ]; then +# patch -Np1 -i ../0008-5.5-bcachefs.patch +# echo "CONFIG_BCACHEFS_FS=m" >> ./.config +# echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config +# echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config +# echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config +# echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config +# echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config +# fi + + # fsync support + if [ -z "$_fsync" ]; then + plain "" + plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" + plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" + read -rp "`echo $' > N/y : '`" CONDITION9; + fi + if [ "$CONDITION9" == "y" ] || [ "$_fsync" == "true" ]; then + patch -Np1 -i ../0007-v5.5-fsync.patch + fi + + # ZFS fix + if [ -z "$_zfsfix" ]; then + plain "" + plain "Add back missing symbol for AES-NI/AVX support on ZFS" + plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" + read -rp "`echo $' > N/y : '`" CONDITION11; + fi + if [ "$CONDITION11" == "y" ] || [ "$_zfsfix" == "true" ]; then + patch -Np1 -i ../0011-ZFS-fix.patch + fi + + # Community patches + if [ -n "$_community_patches" ]; then + _community_patches=($_community_patches) + for _p in ${_community_patches[@]}; do + ln -s "$_where"/../community-patches/linux55-tkg/$_p "$_where"/ + done + fi + + # userpatches + if [ "$_user_patches" == "true" ]; then + _userpatch_target="linux-${_basekernel}" + _userpatch_ext="my" + user_patcher + fi + + # Community patches removal + for _p in ${_community_patches[@]}; do + rm -f "$_where"/$_p + done + + # don't run depmod on 'make install'. We'll do this ourselves in packaging + sed -i '2iexit 0' scripts/depmod.sh + + # get kernel version + make prepare + + # modprobed-db + if [ -z "$_modprobeddb" ]; then + plain "" + plain "Use modprobed db to clean config from unneeded modules?" + plain "Speeds up compilation considerably. Requires root." + plain "https://wiki.archlinux.org/index.php/Modprobed-db" + plain "!!!! Make sure to have a well populated db !!!!" + read -rp "`echo $' > N/y : '`" CONDITIONMPDB; + fi + if [ "$CONDITIONMPDB" == "y" ] || [ "$_modprobeddb" == "true" ]; then + sudo modprobed-db recall + make localmodconfig + fi + + if [ true = "$_config_fragments" ]; then + local fragments=() + mapfile -d '' -t fragments < <(find "$_where" -type f -name "*.myfrag" -print0) + + if [ true = "$_config_fragments_no_confirm" ]; then + printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" + else + for i in "${!fragments[@]}"; do + while true; do + read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB + CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONMPDB" in + y|yes) + break;; + n|no|'') + unset fragments[$i] + break;; + *) + echo 'Please answer with yes or no' + esac + done + done + fi + + if [ 0 -lt "${#fragments[@]}" ]; then + scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" + fi + fi + + # menuconfig / nconfig + if [ -z "$_menunconfig" ]; then + plain "" + plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" + plain "to configure the kernel before building it?" + plain "If you do, make sure your terminal is currently" + plain "at least 19 lines by 80 columns large or you'll get an error :D" + read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n choice[0-2?]: '`" CONDITIONMNC; + _menunconfig="$CONDITIONMNC" + fi + if [ 1 = "$_menunconfig" ]; then + cp .config .config.orig + make menuconfig + elif [ 2 = "$_menunconfig" ]; then + cp .config .config.orig + make nconfig + else + # rewrite configuration + yes "" | make config >/dev/null + fi + if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ]; then + if [ -z "${_diffconfig}" ]; then + while true; do + read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF + CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONF" in + y|yes) + _diffconfig=true + break;; + n|no|'') + _diffconfig=false + break;; + *) + echo 'Please answer with yes or no' + esac + done + fi + if [ true = "$_diffconfig" ]; then + if [ -z "$_diffconfig_name" ]; then + IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name + fi + if [ -z "$_diffconfig_name" ]; then + echo 'No file name given, not generating config fragment.' + else ( + prev_pwd="${PWD:-$(pwd)}" + cd "$_where" + "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" + ) fi + fi + rm .config.orig + fi + + make -s kernelrelease > version + msg2 "Prepared %s version %s" "$pkgbase" "$(&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) +} + +hackbase() { + pkgdesc="The $pkgdesc kernel and modules" + depends=('coreutils' 'kmod' 'initramfs') + optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' + 'crda: to set the correct wireless channels of your country.' + 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' + 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' + 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' + 'update-grub: Simple wrapper around grub-mkconfig.') + provides=("linux=${pkgver}" "${pkgbase}") + + cd "${srcdir}/linux-${_basekernel}" + + # get kernel version + local _kernver="$(=1000, default is 4000)* + +PDS-mq was originally created by Alfred Chen : http://cchalpha.blogspot.com/ +While he dropped it with kernel 5.1 in favor of its BMQ evolution/rework, my pretty bad gaming experiences with BMQ up to this point convinced me to keep PDS afloat for as long as it'll make sense/I'll be able to. + +You can find prebuilts on chaotic-aur, but if you need the extra-spice of per-arch optimized prebuilts, you can find PDS and MuQSS variants daily builds here : https://repo.kitsuna.net/ - Thanks to LordKitsuna. 
+ +Comes with a slightly modified Arch config asking for a few core personalization settings at compilation time. +If you want to streamline your kernel config for lower footprint and faster compilations : https://wiki.archlinux.org/index.php/Modprobed-db +You can enable support for it at the beginning of the PKGBUILD file. Make sure to read everything you need to know about it. + +## Other stuff included: +- Per-CPU-arch native optimizations +- memory management and swapping tweaks +- scheduling tweaks +- using preferred raid6 gen function directly +- using lz4 algo for zswap by default +- built-in Thinkpad hardware functions driver / embedded controller LPC3 functions / SMAPI support +- optional "Zenify" patchset using core blk, mm and scheduler tweaks from Zen +- CFS tweaks +- using YeAH TCP congestion algo by default +- using cake network queue management system +- using vm.max_map_count=262144 by default +- intel E1000 fixes +- cherry-picked clear linux patches +- **optional** overrides for missing ACS capabilities +- **optional** ZFS fpu symbols +- **optional** Fsync support (proton) diff --git a/linux55-tkg/customization.cfg b/linux55-tkg/customization.cfg new file mode 100644 index 0000000..95b39a3 --- /dev/null +++ b/linux55-tkg/customization.cfg @@ -0,0 +1,175 @@ +# linux55-TkG config file + + +#### MISC OPTIONS #### + +# External config file to use - If the given file exists in path, it will override default config (customization.cfg) - Default is ~/.config/frogminer/linux55-tkg.cfg +_EXT_CONFIG_PATH=~/.config/frogminer/linux55-tkg.cfg + +# Set to anything other than "true" to limit cleanup operations and keep source and files generated during compilation. +# Default is "true". +_NUKR="true" + +# Custom compiler root dirs - Leave empty to use system compilers +# Example: CUSTOM_GCC_PATH="/home/frog/PKGBUILDS/mostlyportable-gcc/gcc-mostlyportable-9.2.0" +CUSTOM_GCC_PATH="" + +# Set to the number corresponding to a predefined profile to use it. Current list of available profiles : +# 1 - Custom (meaning nothing will be enforced and you get to configure everything) +# 2 - Ryzen desktop (performance) +# 3 - Generic desktop (performance) +_OPTIPROFILE="" + +# Set to true to bypass makepkg.conf and use all available threads for compilation. False will respect your makepkg.conf options. +_force_all_threads="true" + +# Set to true to use modprobed-db to clean config from unneeded modules. Speeds up compilation considerably. Requires root - https://wiki.archlinux.org/index.php/Modprobed-db +# !!!! Make sure to have a well populated db !!!! - Leave empty to be asked about it at build time +_modprobeddb="false" + +# Set to "1" to call make menuconfig or "2" to call make nconfig before building the kernel. Set to false to disable and skip the prompt. +_menunconfig="" + +# Set to true to generate a kernel config fragment from your changes in menuconfig/nconfig. Set to false to disable and skip the prompt. +_diffconfig="" + +# Set to the file name where the generated config fragment should be written to. Only used if _diffconfig is active. +_diffconfig_name="" + +#### KERNEL OPTIONS #### + +# Name of the default config file to use from the linux???-tkg-config folder. Arch default is "config.x86_64" and Arch hardened is "config_hardened.x86_64".
+# To get a complete hardened setup, you have to use "cfs" as _cpusched +_configfile="config.x86_64" + +# Disable some non-module debugging - See PKGBUILD for the list +_debugdisable="false" + +# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME + +# CPU scheduler - Options are "pds", "muqss", "bmq", "cfs" or "cfsturbo" (experimental, for power CPUs https://lkml.org/lkml/2019/7/25/296) +# "pds" is the recommended option for gaming +_cpusched="" + +# CPU sched_yield_type - Choose what sort of yield sched_yield will perform +# For PDS and MuQSS: 0: No yield. (Recommended option for gaming on PDS and MuQSS) +# 1: Yield only to better priority/deadline tasks. (Default - can be unstable with PDS on some platforms) +# 2: Expire timeslice and recalculate deadline. (Usually the slowest option for PDS and MuQSS, not recommended) +# For BMQ: 0: No yield. +# 1: Deboost and requeue task. (Default) +# 2: Set rq skip task. +_sched_yield_type="" + +# Round Robin interval is the longest duration two tasks with the same nice level will be delayed for. When CPU time is requested by a task, it receives a time slice equal +# to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low value can help offset the disadvantages of rescheduling a process that has yielded. +# MuQSS default: 6ms +# PDS default: 4ms +# BMQ default: 4ms +# Set to "1" for 2ms, "2" for 4ms, "3" for 6ms, "4" for 8ms, or "default" to keep the chosen scheduler defaults. +_rr_interval="" + +# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" +_ftracedisable="false" + +# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" +_numadisable="false" + +# Set to "1" to use CattaRappa mode (enabling full tickless), "2" for tickless idle only, or "0" for periodic ticks. +# Full tickless can give higher performance in various cases but, depending on hardware, lower consistency. Just tickless idle can perform better on some platforms (mostly AMD based). +_tickless="" + +# Setting this to "true" can improve latency on PDS (at the cost of throughput) and improve throughput on other schedulers (at the cost of latency) - Can improve VM performance - Kernel default is "false" +_voluntary_preempt="" + +# Set to "true" to enable Device Tree and Open Firmware support. If you don't know about it, you don't need it - Default is "false" +_OFenable="false" + +# Set to "true" to use ACS override patch - https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29 - Kernel default is "false" +_acs_override="" + +# Set to "true" to add back missing symbol for AES-NI/AVX support on ZFS - https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions.patch - Kernel default is "false" +_zfsfix="" + +# Set to "true" to enable support for fsync, an experimental replacement for esync found in Valve Proton 4.11+ - https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305 +_fsync="" + +# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" +_zenify="true" + +# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3.
Optimize for size (-Os) - Kernel default is "1" +_compileroptlevel="1" + +# CPU compiler optimizations - Defaults to generic optimizations if left empty +# AMD CPUs : "k8" "k8sse3" "k10" "barcelona" "bobcat" "jaguar" "bulldozer" "piledriver" "steamroller" "excavator" "zen" "zen2" +# Intel CPUs : "mpsc"(P4 & older Netburst based Xeon) "atom" "core2" "nehalem" "westmere" "silvermont" "sandybridge" "ivybridge" "haswell" "broadwell" "skylake" "skylakex" "cannonlake" "icelake" "goldmont" "goldmontplus" "cascadelake" +# Other options : +# - "generic" (to share the package between machines with different CPUs) +# - "native" (use compiler autodetection and will prompt for P6_NOPS - Selecting your arch manually in the list above is recommended instead of this option) +_processor_opt="" + +# MuQSS only - Make IRQ threading compulsory (FORCE_IRQ_THREADING) - Default is "false" +_irq_threading="false" + +# MuQSS and PDS only - SMT (Hyperthreading) aware nice priority and policy support (SMT_NICE) - Kernel default is "true" - You can disable this on non-SMT/HT CPUs for lower overhead +_smt_nice="" + +# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" +_random_trust_cpu="false" + +# MuQSS only - CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL) +# Valid values are "none", "smt", "mc", "mc-llc" (for zen), "smp", "all" - Kernel default is "smt" +_runqueue_sharing="" + +# Timer frequency - "100", "500", "750" or "1000" - More options available in kernel config prompt when left empty depending on selected cpusched - Kernel default is "500" - For MuQSS, 100Hz is recommended +_timer_freq="" + +# Default CPU governor - "performance", "ondemand", "schedutil" or leave empty for default (schedutil) +_default_cpu_gov="ondemand" + +# Use an aggressive ondemand governor instead of default ondemand to improve performance on low loads/high core count CPUs while keeping some power efficiency from frequency scaling. +# It still requires you to either set ondemand as default governor or to select it some way. +_aggressive_ondemand="true" + +# On some platforms, an acpi_cpufreq bug affects performance negatively. Set to "true" to disable it as a workaround, but it will use more power. +# https://github.com/Tk-Glitch/PKGBUILDS/issues/263 +_disable_acpi_cpufreq="" + +# You can pass a default set of kernel command line options here - example: "intel_pstate=passive nowatchdog amdgpu.ppfeaturemask=0xfffd7fff mitigations=off" +_custom_commandline="intel_pstate=passive" + +# Set to false to use Arch's default TTY font (TER16x32), which is a bit too huge on many displays +_font_autoselect="true" + + +#### SPESHUL OPTION #### + +# If you want to bypass the stock naming scheme and enforce something else (example : "linux") - Useful for some bootloaders requiring manual entry editing on each release. +# !!! It will also change pkgname - If you don't explicitly need this, don't use it !!!
+_custom_pkgbase="" + + +#### USER PATCHES #### + +# community patches - add patches (separated by a space) of your choice by name from the community-patches dir +# example: _community_patches="clear_nack_in_tend_isr.myrevert ffb_regression_fix.mypatch 0008-drm-amd-powerplay-force-the-trim-of-the-mclk-dpm-levels-if-OD-is-enabled.mypatch" +_community_patches="" + +# You can use your own patches by putting them in the same folder as the PKGBUILD and giving them the .mypatch extension. +# You can also revert patches by putting them in the same folder as the PKGBUILD and giving them the .myrevert extension. + +# Also, userpatches variable below must be set to true for the above to work. +_user_patches="true" + +# Apply all user patches without confirmation - !!! NOT RECOMMENDED !!! +_user_patches_no_confirm="false" + + +#### CONFIG FRAGMENTS #### + +# You can use your own kernel config fragments by putting them in the same folder as the PKGBUILD and giving them the .myfrag extension. + +# Also, the config fragments variable below must be set to true for the above to work. +_config_fragments="true" + +# Apply all config fragments without confirmation - !!! NOT RECOMMENDED !!! +_config_fragments_no_confirm="false" diff --git a/linux55-tkg/linux55-tkg-config/config.x86_64 b/linux55-tkg/linux55-tkg-config/config.x86_64 new file mode 100644 index 0000000..d21a045 --- /dev/null +++ b/linux55-tkg/linux55-tkg-config/config.x86_64 @@ -0,0 +1,10680 @@ +# +# Automatically generated file; DO NOT EDIT. +# Linux/x86 5.5.5-arch1 Kernel Configuration +# + +# +# Compiler: gcc (GCC) 9.2.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=90200 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_CC_HAS_WARN_MAYBE_UNINITIALIZED=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_EXTABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_IRQ_MSI_IOMMU=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_DATA=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# 
+CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_PREEMPTION=y + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_PREEMPT_RCU=y +CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=500 +# CONFIG_RCU_NOCB_CPU is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_CC_HAS_INT128=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_USER_NS_UNPRIVILEGED=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_SYSCALL=y 
+CONFIG_BPF_JIT_ALWAYS_ON=y +# CONFIG_USERFAULTFD is not set +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=5 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +# CONFIG_KVM_DEBUG_FS is not set +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y 
+CONFIG_GART_IOMMU=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance monitoring + +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_X86_IOPL_IOPERM=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +CONFIG_X86_5LEVEL=y +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_UMIP=y +# CONFIG_X86_INTEL_MPX is not set +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_ARCH_HAS_KEXEC_PURGATORY=y +# CONFIG_KEXEC_SIG is not set +CONFIG_CRASH_DUMP=y +CONFIG_KEXEC_JUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +CONFIG_LEGACY_VSYSCALL_XONLY=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y 
+CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +CONFIG_ACPI_EC_DEBUGFS=y +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_DEBUG=y +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +CONFIG_ACPI_CUSTOM_METHOD=m +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_NUMA=y +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_BYTCRC_PMIC_OPREGION=y +CONFIG_CHTCRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_BXT_WC_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y 
+CONFIG_CPU_IDLE_GOV_HALTPOLL=y +CONFIG_HALTPOLL_CPUIDLE=m +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) + +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +CONFIG_X86_DEV_DMA_OPS=y + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_SOFT_RESERVE=y +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +# CONFIG_RESET_ATTACK_MITIGATION is not set +CONFIG_EFI_RCI2_TABLE=y +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y 
+CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_HAVE_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=28 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_PLUGIN_HOSTCC="g++" +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_RQ_ALLOC_TIME=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_CGROUP_RWSTAT=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y 
+CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_TRANSPARENT_HUGE_PAGECACHE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +# CONFIG_MEM_SOFT_DIRTY is not set +CONFIG_ZSWAP=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y +CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set 
+CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_MAPPING_DIRTY_HELPERS=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +# CONFIG_TLS_TOE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_NETLABEL=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y 
+CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_SET=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m 
+CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m 
+CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_CRYPTO=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_OCELOT=m +CONFIG_NET_DSA_TAG_QCA=m +CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m 
+CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +CONFIG_DEFAULT_FQ_CODEL=y +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y 
+CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +CONFIG_CAN_J1939=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_KVASER_PCIEFD=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_M_CAN_PLATFORM=m +CONFIG_CAN_M_CAN_TCAN4X5X=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +CONFIG_CAN_F81601=m +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +CONFIG_BT_DEBUGFS=y + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m +CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +CONFIG_AF_RXRPC_DEBUG=y +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y 
+CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +CONFIG_CFG80211_DEBUGFS=y +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +CONFIG_MAC80211_DEBUGFS=y +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_PN532_UART=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m 
+CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support +# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +CONFIG_PCIE_CADENCE_PLAT=y +CONFIG_PCIE_CADENCE_PLAT_HOST=y +CONFIG_PCIE_CADENCE_PLAT_EP=y +# end of Cadence PCIe controllers support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_CACHE=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +CONFIG_MOXTET=m +CONFIG_SIMPLE_PM_BUS=y +# end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m +CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y 
+CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_MXIC=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m +CONFIG_MTD_NAND_CADENCE=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_MTK_QUADSPI=m +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y 
+CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +# CONFIG_VIRTIO_BLK_SCSI is not set +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_HWMON=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# +CONFIG_INTEL_MIC_BUS=m +CONFIG_SCIF_BUS=m +CONFIG_VOP_BUS=m +CONFIG_INTEL_MIC_HOST=m +CONFIG_INTEL_MIC_CARD=m +CONFIG_SCIF=m +CONFIG_MIC_COSM=m +CONFIG_VOP=m +CONFIG_VHOST_RING=m +# end of Intel MIC & related support + +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=m +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y 
+CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=m +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native 
interface +# +CONFIG_SATA_AHCI=m +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +CONFIG_DM_DEBUG=y +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m 
+CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m +CONFIG_CAIF_DRIVERS=y +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_MSCC_FELIX=m +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +CONFIG_NET_DSA_SJA1105_TAS=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m +CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m +CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m 
+CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m 
+CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +CONFIG_IONIC=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m 
+CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_ADIN_PHY=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_DP83869_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +CONFIG_ATH5K_DEBUG=y +CONFIG_ATH5K_TRACER=y +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_COMMON_DEBUG=y +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +CONFIG_ATH9K_DEBUGFS=y +CONFIG_ATH9K_STATION_STATISTICS=y +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_PCI_NO_EEPROM=m +CONFIG_ATH9K_HTC=m +CONFIG_ATH9K_HTC_DEBUGFS=y +CONFIG_ATH9K_HWRNG=y +CONFIG_ATH9K_COMMON_SPECTRAL=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_DEBUGFS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +CONFIG_ATH6KL_DEBUG=y 
+CONFIG_ATH6KL_TRACING=y +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +CONFIG_ATH10K_DEBUG=y +CONFIG_ATH10K_DEBUGFS=y +CONFIG_ATH10K_SPECTRAL=y +CONFIG_ATH10K_TRACING=y +CONFIG_WCN36XX=m +CONFIG_WCN36XX_DEBUGFS=y +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +CONFIG_IWLEGACY_DEBUG=y +CONFIG_IWLEGACY_DEBUGFS=y +# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +CONFIG_IWLWIFI_DEBUG=y +CONFIG_IWLWIFI_DEBUGFS=y +CONFIG_IWLWIFI_DEVICE_TRACING=y +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m 
+CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615E=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +CONFIG_RT2X00_LIB_DEBUGFS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822BE=y +CONFIG_RTW88_8822CE=y +CONFIG_RTW88_DEBUG=y +CONFIG_RTW88_DEBUGFS=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +CONFIG_RSI_DEBUGFS=y +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_THUNDERBOLT_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=m +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# 
Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_JOYSTICK_FSIA6B=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m 
+CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_BU21029=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_MSM_VIBRATOR=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m 
+CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_GPIO_VIBRA=m +CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_NOZOMI=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_NULL_TTY=m +CONFIG_LDISC_AUTOLOAD=y +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# 
CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_OF_PLATFORM=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_FSL_LINFLEXUART=m +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_NVRAM=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_SPI_CR50=y +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_DEVPORT=y +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m 
+CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_NVIDIA_GPU=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +# CONFIG_I2C_STUB is not set +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PTP_1588_CLOCK_IDTCM=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y 
+CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_TIGERLAKE=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y +CONFIG_PINCTRL_CS47L92=y +CONFIG_PINCTRL_EQUILIBRIUM=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LYNXPOINT=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +CONFIG_GPIO_MOXTET=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +CONFIG_W1_MASTER_SGI=m +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2430=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# 
CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS250X=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_MT6323=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m +CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m +CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_AS370=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m 
+CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2947=m +CONFIG_SENSORS_LTC2947_I2C=m +CONFIG_SENSORS_LTC2947_SPI=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_BEL_PFE=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_INSPUR_IPSPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_TMP513=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m 
+CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_QORIQ_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m 
+CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m +CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_MFD_SMSC=y +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m 
+CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8350=y +CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_ANATOP=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m +CONFIG_REGULATOR_SY8824X=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m 
+CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=y +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m 
+CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_USBVISION=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m 
+CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 +CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CROS_EC_CEC=m +CONFIG_CEC_GPIO=m +CONFIG_VIDEO_SECO_CEC=m +CONFIG_VIDEO_SECO_RC=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m +# end of Texas Instruments WL128x FM driver (ST based) + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# I2C Encoders, decoders, sensors and other helper chips +# + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m 
+CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y +CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_HI556=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX290=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +CONFIG_VIDEO_OV5675=m +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m +CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m +CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m +CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +CONFIG_VIDEO_DW9807_VCM=m + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of I2C Encoders, decoders, sensors and other helper chips + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m 
+CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m 
+CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Customise DVB Frontends + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_TTM_DMA_PAGE_POOL=y +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_TTM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +CONFIG_DRM_KOMEDA_ERROR_PRINT=y +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN1_0=y +CONFIG_DRM_AMD_DC_DCN2_0=y +CONFIG_DRM_AMD_DC_DCN2_1=y +CONFIG_DRM_AMD_DC_DSC_SUPPORT=y +CONFIG_DRM_AMD_DC_HDCP=y +# CONFIG_DEBUG_KERNEL_DC is not set +# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_ALPHA_SUPPORT=y +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 +CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 +CONFIG_DRM_I915_SPIN_REQUEST=5 +CONFIG_DRM_I915_STOP_TIMEOUT=100 +CONFIG_DRM_I915_TIMESLICE_DURATION=1 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m 
+CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +CONFIG_DRM_PANEL_LG_LB035Q02=m +CONFIG_DRM_PANEL_LG_LG4573=m +CONFIG_DRM_PANEL_NEC_NL8048HL11=m +CONFIG_DRM_PANEL_NOVATEK_NT39016=m +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m +CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +CONFIG_DRM_PANEL_SONY_ACX565AKM=m +CONFIG_DRM_PANEL_TPO_TD028TTEC1=m +CONFIG_DRM_PANEL_TPO_TD043MTEA1=m +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_DUMB_VGA_DAC=m +CONFIG_DRM_LVDS_ENCODER=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7533=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +CONFIG_DRM_GM12U320=m +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# 
CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_QCOM_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m 
+CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y 
+CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=4096 +CONFIG_SND_INTEL_NHLT=y +CONFIG_SND_INTEL_DSP_CONFIG=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +CONFIG_SND_SOC_INTEL_CNL=m +CONFIG_SND_SOC_INTEL_CFL=m +CONFIG_SND_SOC_INTEL_CML_H=m +CONFIG_SND_SOC_INTEL_CML_LP=m +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m 
+CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m +CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +CONFIG_SND_SOC_SOF_TOPLEVEL=y +CONFIG_SND_SOC_SOF_PCI=m +CONFIG_SND_SOC_SOF_ACPI=m +CONFIG_SND_SOC_SOF_OF=m +# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set +CONFIG_SND_SOC_SOF=m +CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y +CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y +CONFIG_SND_SOC_SOF_INTEL_ACPI=m +CONFIG_SND_SOC_SOF_INTEL_PCI=m +CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m +CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m +CONFIG_SND_SOC_SOF_INTEL_COMMON=m +CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y +CONFIG_SND_SOC_SOF_MERRIFIELD=m +CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_APOLLOLAKE=m +CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_GEMINILAKE=m +CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_CANNONLAKE=m +CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COFFEELAKE=m +CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ICELAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y +CONFIG_SND_SOC_SOF_COMETLAKE_H=m +CONFIG_SND_SOC_SOF_COMETLAKE_H_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE=m +CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ELKHARTLAKE=m +CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_JASPERLAKE=m +CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_LINK=y +CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y +# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set +CONFIG_SND_SOC_SOF_HDA_COMMON_HDMI_CODEC=y +CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_HDA=m +CONFIG_SND_SOC_SOF_XTENSA=m + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_ADAU7118=m +CONFIG_SND_SOC_ADAU7118_HW=m +CONFIG_SND_SOC_ADAU7118_I2C=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m 
+CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_HDAC_HDA=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT1011=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS2562=m +CONFIG_SND_SOC_TAS2770=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m 
+CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CREATIVE_SB0540=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# 
CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_USB_CONN_GPIO=m +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_CDNS3=m +CONFIG_USB_CDNS3_GADGET=y +CONFIG_USB_CDNS3_HOST=y +CONFIG_USB_CDNS3_PCI_WRAP=m +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m 
+CONFIG_USB_CHIPIDEA_OF=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# 
+CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_WCOVE=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_HD3SS3220=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_SDHCI_OF_ASPEED=m +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_SDHCI_MILBEAUT=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m 
+CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers +# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_EL15203000=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP5521 is not set +# CONFIG_LEDS_LP5523 is not set +# CONFIG_LEDS_LP5562 is not set +# CONFIG_LEDS_LP8501 is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m 
+CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m 
+CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m +CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m +CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_SNVS=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y +CONFIG_SF_PDMA=m + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_SELFTESTS is not set +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m 
+CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7192=m 
+CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m +# end of Speakup console speech + +CONFIG_STAGING_MEDIA=y +CONFIG_VIDEO_IPU3_IMGU=m + +# +# soc_camera sensor drivers +# + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_COMMON_CLK_XLNX_CLKWZRD=m +# CONFIG_FB_TFT is not set +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m + +# +# ISDN CAPI drivers +# +CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_CAPI=y +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +# end of ISDN CAPI drivers + +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# CONFIG_USB_WUSB_CBAF_DEBUG is not set +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_I1480U=m +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DISCARD=y +# CONFIG_EXFAT_DELAYED_SYNC is not set +# CONFIG_EXFAT_KERNEL_DEBUG is not set +# CONFIG_EXFAT_DEBUG_MSG is not set +CONFIG_EXFAT_DEFAULT_CODEPAGE=437 +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +CONFIG_QLGE=m +CONFIG_NET_VENDOR_HP=y +CONFIG_HP100=m +CONFIG_WFX=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACERHDF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y 
+CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_DELL_SMO8800=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_AMILO_RFKILL=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_LG_LAPTOP=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SURFACE3_WMI=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_SENSORS_HDAPS=m +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_WMI=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_XIAOMI_WMI=m +CONFIG_MSI_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_VBTN=m +CONFIG_INTEL_IPS=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_IBM_RTL=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_MXM_WMI=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_SAMSUNG_Q10=m +CONFIG_APPLE_GMUX=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_MLX_PLATFORM=m +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_INTEL_ATOMISP2_PM=m +CONFIG_HUAWEI_WMI=m +CONFIG_PCENGINES_APU2=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_SYSTEM76_ACPI=m +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +CONFIG_CROS_EC_DEBUGFS=m +CONFIG_CROS_EC_SENSORHUB=m +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_WILCO_EC=m +CONFIG_WILCO_EC_DEBUGFS=m +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +CONFIG_CLK_HSDK=y +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +# end of Common Clock Framework + 
+CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_IOMMU_DMA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=y +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=m + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m 
+CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7124=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7292=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m +CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of 
Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16460=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_FXOS8700=m +CONFIG_FXOS8700_I2C=m +CONFIG_FXOS8700_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_ADUX1020=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP020A00F=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +CONFIG_NOA1305=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6030=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5432=m 
+CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_LTC2983=m +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_DP=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set +CONFIG_THUNDERBOLT=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y 
+CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_HMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_MASTER_ASPEED=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + +CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +CONFIG_INTERCONNECT=m +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_VERITY=y +# 
CONFIG_FS_VERITY_DEBUG is not set +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y +CONFIG_PROC_VMCORE_DEVICE_DUMP=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# 
CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFS_DEBUG=y +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +# CONFIG_CIFS_WEAK_PW_HASH is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m 
+CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +CONFIG_IO_WQ=y +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +# CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set +CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y +# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set +# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set +# CONFIG_INTEGRITY is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +# CONFIG_GCC_PLUGIN_STACKLEAK is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# 
Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_CURVE25519_X86=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=y +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_BLAKE2B=m +CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_BLAKE2S_X86=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m 
+CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=m +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y + +# +# Crypto library routines +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m +CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m +CONFIG_CRYPTO_LIB_BLAKE2S=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m +CONFIG_CRYPTO_LIB_CHACHA=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_LIB_POLY1305_RSIZE=4 +CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m +CONFIG_CRYPTO_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_SAFEXCEL=m +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set 
+CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=m +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_FONT_TER16x32=y +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_MEMREGION=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +CONFIG_SYMBOLIC_ERRNAME=y +CONFIG_DEBUG_BUGVERBOSE=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_HEADERS_INSTALL is not set +CONFIG_OPTIMIZE_INLINING=y +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +# +# Generic Kernel Debugging Instruments +# +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_DEBUG_FS=y +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# 
CONFIG_UBSAN is not set +CONFIG_UBSAN_ALIGNMENT=y +# end of Generic Kernel Debugging Instruments + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_PAGE_POISONING=y +CONFIG_PAGE_POISONING_NO_SANITY=y +CONFIG_PAGE_POISONING_ZERO=y +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_HAVE_ARCH_KASAN_VMALLOC=y +CONFIG_CC_HAS_KASAN_GENERIC=y +# CONFIG_KASAN is not set +CONFIG_KASAN_STACK=1 +# end of Memory Debugging + +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# end of Debug Oops, Lockups and Hangs + +# +# Scheduler Debugging +# +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +# end of Scheduler Debugging + +# CONFIG_DEBUG_TIMEKEEPING is not set +CONFIG_DEBUG_PREEMPT=y + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) 
+ +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set + +# +# Debug kernel data structures +# +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_PLIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# end of Debug kernel data structures + +# CONFIG_DEBUG_CREDENTIALS is not set + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_FTRACE_MCOUNT_RECORD=y +# CONFIG_FTRACE_STARTUP_TEST is not set +CONFIG_MMIOTRACE=y +CONFIG_TRACING_MAP=y +CONFIG_HIST_TRIGGERS=y +# CONFIG_TRACE_EVENT_INJECT is not set +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_SAMPLES is not set +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y + +# +# x86 Debugging +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +CONFIG_X86_PTDUMP_CORE=y +# CONFIG_X86_PTDUMP is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG 
is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of x86 Debugging + +# +# Kernel Testing and Coverage +# +# CONFIG_KUNIT is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_MEMTEST is not set +# CONFIG_HYPERV_TESTING is not set +# end of Kernel Testing and Coverage +# end of Kernel hacking diff --git a/linux55-tkg/linux55-tkg-config/generic-desktop-profile.cfg b/linux55-tkg/linux55-tkg-config/generic-desktop-profile.cfg new file mode 100644 index 0000000..97049ce --- /dev/null +++ b/linux55-tkg/linux55-tkg-config/generic-desktop-profile.cfg @@ -0,0 +1,48 @@ +# linux55-TkG config file +# Generic Desktop + + +#### MISC OPTIONS #### + +# External config file to use - If the given file exists in path, it will override default config (customization.cfg) - Default is ~/.config/frogminer/linux50-tkg.cfg +_EXT_CONFIG_PATH=~/.config/frogminer/linux55-tkg.cfg + +#### KERNEL OPTIONS #### + +# Name of the default config file to use from the linux???-tkg-config folder. Arch default is "config.x86_64". 
+_configfile="config.x86_64" + +# Disable some non-module debugging - See PKGBUILD for the list +_debugdisable="false" + +# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME + +# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" +_ftracedisable="false" + +# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" +_numadisable="false" + +# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" +_voluntary_preempt="false" + +# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" +_zenify="true" + +# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. Optimize for size (-Os) - Kernel default is "2" +_compileroptlevel="1" + +# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" +_random_trust_cpu="false" + + +#### USER PATCHES #### + +# You can use your own patches by putting them in the same folder as the PKGBUILD and giving them the .mypatch extension. +# You can also revert patches by putting them in the same folder as the PKGBUILD and giving them the .myrevert extension. + +# Also, userpatches variable below must be set to true for the above to work. +_user_patches="true" + +# Apply all user patches without confirmation - !!! NOT RECOMMENDED !!! +_user_patches_no_confirm="false" diff --git a/linux55-tkg/linux55-tkg-config/ryzen-desktop-profile.cfg b/linux55-tkg/linux55-tkg-config/ryzen-desktop-profile.cfg new file mode 100644 index 0000000..c1f408a --- /dev/null +++ b/linux55-tkg/linux55-tkg-config/ryzen-desktop-profile.cfg @@ -0,0 +1,51 @@ +# linux55-TkG config file +# Ryzen Desktop + + +#### MISC OPTIONS #### + +# External config file to use - If the given file exists in path, it will override default config (customization.cfg) - Default is ~/.config/frogminer/linux52-tkg.cfg +_EXT_CONFIG_PATH=~/.config/frogminer/linux55-tkg.cfg + +#### KERNEL OPTIONS #### + +# Name of the default config file to use from the linux???-tkg-config folder. Arch default is "config.x86_64". +_configfile="config.x86_64" + +# Disable some non-module debugging - See PKGBUILD for the list +_debugdisable="false" + +# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME + +# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" +_ftracedisable="false" + +# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" +_numadisable="false" + +# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" +_voluntary_preempt="false" + +# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" +_zenify="true" + +# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. 
Optimize for size (-Os) - Kernel default is "2" +_compileroptlevel="1" + +# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" +_random_trust_cpu="false" + +# Default CPU governor - "performance", "ondemand" (tweaked), "schedutil" or leave empty for default (schedutil on AMD and legacy Intel, intel_pstate on modern Intel) - Enforcing an option will disable intel_pstate altogether! +_default_cpu_gov="performance" + + +#### USER PATCHES #### + +# You can use your own patches by putting them in the same folder as the PKGBUILD and giving them the .mypatch extension. +# You can also revert patches by putting them in the same folder as the PKGBUILD and giving them the .myrevert extension. + +# Also, userpatches variable below must be set to true for the above to work. +_user_patches="true" + +# Apply all user patches without confirmation - !!! NOT RECOMMENDED !!! +_user_patches_no_confirm="false" diff --git a/linux55-tkg/linux55-tkg-patches/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux55-tkg/linux55-tkg-patches/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch new file mode 100644 index 0000000..3cef558 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch @@ -0,0 +1,156 @@ +From 5ec2dd3a095442ec1a21d86042a4994f2ba24e63 Mon Sep 17 00:00:00 2001 +Message-Id: <5ec2dd3a095442ec1a21d86042a4994f2ba24e63.1512651251.git.jan.steffens@gmail.com> +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. 
++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux55-tkg/linux55-tkg-patches/0002-clear-patches.patch b/linux55-tkg/linux55-tkg-patches/0002-clear-patches.patch new file mode 100644 index 0000000..a7c9d4a --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0002-clear-patches.patch @@ -0,0 +1,354 @@ +From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c25acace7d91..0ddebdad9f5b 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -61,7 +61,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +2.20.1 + +From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 8b5d85c91e9d..5e2d813a048d 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = 
MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- 
.target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +2.20.1 + +From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index cf3c5095c10e..b30d51837b2d 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3897,8 +3897,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.20.1 + +From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index eef04551eae7..1ec5ab4c8ff7 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -720,6 +720,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 +From: 
Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] drivers: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index aaef17cc6512..d08f3a394929 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -58,15 +58,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/linux55-tkg/linux55-tkg-patches/0003-glitched-base.patch b/linux55-tkg/linux55-tkg-patches/0003-glitched-base.patch new file mode 100644 index 0000000..371e60f --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0003-glitched-base.patch @@ -0,0 +1,3965 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index 87f1fc9..b3be470 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -50,8 +50,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" + +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index af9c967782f6..bf07a8c0f495 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -123,6 +123,7 @@ config MPENTIUMM + config MPENTIUM4 + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" + depends on X86_32 ++ select X86_P6_NOP + ---help--- + Select this for Intel Pentium 4 chips. This includes the + Pentium 4, Pentium D, P4-based Celeron and Xeon, and +@@ -155,9 +156,8 @@ config MPENTIUM4 + -Paxville + -Dempsey + +- + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + ---help--- + Select this for an AMD K6-family processor. Enables use of +@@ -165,7 +165,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + ---help--- + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +173,90 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + ---help--- + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. 
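The scripts/mkcompile_h hunk at the top of this patch prepends a "TKG" token to CONFIG_FLAGS, so the tag ends up in the kernel's UTS version string. A quick way to confirm a finished build carries it, assuming the patched tree was actually used:

# uname -v
#1 TKG SMP PREEMPT ...
# cat /proc/version

The rest of the output depends on the local build; only the TKG token comes from the hunk above.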
+ ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ ---help--- ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ ---help--- ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ ---help--- ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ ---help--- ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ ---help--- ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ ---help--- ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ ---help--- ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ ---help--- ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ ---help--- ++ Select this for AMD Family 15h Excavator processors. ++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ ---help--- ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ ---help--- ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -260,6 +338,7 @@ config MVIAC7 + + config MPSC + bool "Intel P4 / older Netburst based Xeon" ++ select X86_P6_NOP + depends on X86_64 + ---help--- + Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey +@@ -269,8 +348,19 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" ++ select X86_P6_NOP + ---help--- + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,14 +368,133 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" ++ select X86_P6_NOP + ---help--- + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. 
++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -294,6 +503,19 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE ++ bool "Native optimizations autodetected by GCC" ++ ---help--- ++ ++ GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. -march=native ++ also detects and applies additional settings beyond -march specific ++ to your CPU, (eg. -msse4). Unless you have a specific reason not to ++ (e.g. distcc cross-compiling), you should probably be using ++ -march=native rather than anything listed below. 
++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -318,7 +540,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "4" if MELAN || M486SX || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -336,35 +558,36 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MATOM || MNATIVE + + config X86_USE_3DNOW + def_bool y + depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML + +-# +-# P6_NOPs are a relatively minor optimization that require a family >= +-# 6 processor, except that it is broken on certain VIA chips. +-# Furthermore, AMD chips prefer a totally different sequence of NOPs +-# (which work on all CPUs). In addition, it looks like Virtual PC +-# does not understand them. +-# +-# As a result, disallow these if we're not compiling for X86_64 (these +-# NOPs do work on all x86-64 capable chips); the list of processors in +-# the right-hand clause are the cores that benefit from this optimization. +-# + config X86_P6_NOP +- def_bool y +- depends on X86_64 +- depends on (MCORE2 || MPENTIUM4 || MPSC) ++ default n ++ bool "Support for P6_NOPs on Intel chips" ++ depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE) ++ ---help--- ++ P6_NOPs are a relatively minor optimization that require a family >= ++ 6 processor, except that it is broken on certain VIA chips. ++ Furthermore, AMD chips prefer a totally different sequence of NOPs ++ (which work on all CPUs). 
In addition, it looks like Virtual PC ++ does not understand them. ++ ++ As a result, disallow these if we're not compiling for X86_64 (these ++ NOPs do work on all x86-64 capable chips); the list of processors in ++ the right-hand clause are the cores that benefit from this optimization. ++ ++ Say Y if you have Intel CPU newer than Pentium Pro, N otherwise. + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE || MATOM) || X86_64 + + config X86_CMPXCHG64 + def_bool y +@@ -374,7 +597,7 @@ config X86_CMPXCHG64 + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 94df0868804b..dcbed7e3a070 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -119,13 +119,53 @@ else + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) ++ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) ++ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) ++ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) ++ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) ++ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) ++ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) ++ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) ++ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) ++ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) ++ cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4) ++ cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1) ++ cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ +- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) +- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) ++ cflags-$(CONFIG_MNEHALEM) += \ ++ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) ++ 
cflags-$(CONFIG_MWESTMERE) += \ ++ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) ++ cflags-$(CONFIG_MSILVERMONT) += \ ++ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) ++ cflags-$(CONFIG_MGOLDMONT) += \ ++ $(call cc-option,-march=goldmont,$(call cc-option,-mtune=goldmont)) ++ cflags-$(CONFIG_MGOLDMONTPLUS) += \ ++ $(call cc-option,-march=goldmont-plus,$(call cc-option,-mtune=goldmont-plus)) ++ cflags-$(CONFIG_MSANDYBRIDGE) += \ ++ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) ++ cflags-$(CONFIG_MIVYBRIDGE) += \ ++ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) ++ cflags-$(CONFIG_MHASWELL) += \ ++ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) ++ cflags-$(CONFIG_MBROADWELL) += \ ++ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) ++ cflags-$(CONFIG_MSKYLAKE) += \ ++ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) ++ cflags-$(CONFIG_MSKYLAKEX) += \ ++ $(call cc-option,-march=skylake-avx512,$(call cc-option,-mtune=skylake-avx512)) ++ cflags-$(CONFIG_MCANNONLAKE) += \ ++ $(call cc-option,-march=cannonlake,$(call cc-option,-mtune=cannonlake)) ++ cflags-$(CONFIG_MICELAKE) += \ ++ $(call cc-option,-march=icelake-client,$(call cc-option,-mtune=icelake-client)) ++ cflags-$(CONFIG_MCASCADELAKE) += \ ++ $(call cc-option,-march=cascadelake,$(call cc-option,-mtune=cascadelake)) ++ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu +index cd3056759880..2c81838df533 100644 +--- a/arch/x86/Makefile_32.cpu ++++ b/arch/x86/Makefile_32.cpu +@@ -24,7 +24,19 @@ cflags-$(CONFIG_MK6) += -march=k6 + # Please note, that patches that add -march=athlon-xp and friends are pointless. + # They make zero difference whatsosever to performance at this time. 
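For the MNATIVE option wired up in the 64-bit Makefile above, the flags GCC will actually pick can be previewed on the build host. A minimal check, assuming a reasonably recent GCC:

# gcc -march=native -Q --help=target | grep -E 'march=|mtune='

The values reported are what -march=native expands to on the machine doing the compile, which is why the MNATIVE help text flags distcc cross-compiling as a reason to avoid it.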
+ cflags-$(CONFIG_MK7) += -march=athlon ++cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) ++cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) ++cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) ++cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) ++cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) ++cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) ++cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) ++cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) ++cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) ++cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon) ++cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon) ++cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2,-march=athlon) + cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +@@ -33,8 +45,22 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-fu + cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) + cflags-$(CONFIG_MVIAC7) += -march=i686 + cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) ++cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) ++cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) ++cflags-$(CONFIG_MGOLDMONT) += -march=i686 $(call tune,goldmont) ++cflags-$(CONFIG_MGOLDMONTPLUS) += -march=i686 $(call tune,goldmont-plus) ++cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) ++cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) ++cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) ++cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) ++cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) ++cflags-$(CONFIG_MSKYLAKEX) += -march=i686 $(call tune,skylake-avx512) ++cflags-$(CONFIG_MCANNONLAKE) += -march=i686 $(call tune,cannonlake) ++cflags-$(CONFIG_MICELAKE) += -march=i686 $(call tune,icelake-client) ++cflags-$(CONFIG_MCASCADELAKE) += -march=i686 $(call tune,cascadelake) ++cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + + # AMD Elan support + cflags-$(CONFIG_MELAN) += -march=i486 +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index c215d2762488..a4fddfe3d4fb 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ -27,6 +27,36 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE ++#define MODULE_PROC_FAMILY "NATIVE " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT 
++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY "CASCADELAKE " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -45,6 +75,28 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE +diff --git a/fs/dcache.c b/fs/dcache.c +index 2acfc69878f5..3f1131431e06 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -69,7 +69,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 211890edf37e..37121563407d 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we average the RT time consumption, measured +@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. +- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. +diff --git a/lib/Kconfig b/lib/Kconfig +index 5fe577673b98..c44c27cd6e05 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -10,6 +10,16 @@ menu "Library routines" + config RAID6_PQ + tristate + ++config RAID6_USE_PREFER_GEN ++ bool "Use prefered raid6 gen function." 
++ default n ++ depends on RAID6_PQ ++ help ++ This option is provided for using prefered raid6 gen function ++ directly instead of calculating the best durning boot-up. ++ The prefered function should be the same as the best one from ++ calculating. ++ + config BITREVERSE + tristate + +diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c +index 5065b1e7e327..1bf3c712a4ca 100644 +--- a/lib/raid6/algos.c ++++ b/lib/raid6/algos.c +@@ -150,6 +150,29 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) + return best; + } + ++#ifdef CONFIG_RAID6_USE_PREFER_GEN ++static inline const struct raid6_calls *raid6_choose_prefer_gen(void) ++{ ++ const struct raid6_calls *const *algo; ++ const struct raid6_calls *best; ++ ++ for (best = NULL, algo = raid6_algos; *algo; algo++) { ++ if (!best || (*algo)->prefer >= best->prefer) { ++ if ((*algo)->valid && !(*algo)->valid()) ++ continue; ++ best = *algo; ++ } ++ } ++ ++ if (best) { ++ printk("raid6: using algorithm %s\n", best->name); ++ raid6_call = *best; ++ } else ++ printk("raid6: Yikes! No algorithm found!\n"); ++ ++ return best; ++} ++#else + static inline const struct raid6_calls *raid6_choose_gen( + void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) + { +@@ -221,6 +244,7 @@ static inline const struct raid6_calls *raid6_choose_gen( + + return best; + } ++#endif + + + /* Try to pick the best algorithm */ +@@ -228,10 +252,11 @@ static inline const struct raid6_calls *raid6_choose_gen( + + int __init raid6_select_algo(void) + { +- const int disks = (65536/PAGE_SIZE)+2; +- + const struct raid6_calls *gen_best; + const struct raid6_recov_calls *rec_best; ++#ifndef CONFIG_RAID6_USE_PREFER_GEN ++ const int disks = (65536/PAGE_SIZE)+2; ++ + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i; +@@ -252,11 +277,16 @@ int __init raid6_select_algo(void) + + /* select raid gen_syndrome function */ + gen_best = raid6_choose_gen(&dptrs, disks); ++#else ++ gen_best = raid6_choose_prefer_gen(); ++#endif + + /* select raid recover functions */ + rec_best = raid6_choose_recov(); + ++#ifndef CONFIG_RAID6_USE_PREFER_GEN + free_pages((unsigned long)syndromes, 1); ++#endif + + return gen_best && rec_best ? 
0 : -EINVAL; + } +diff --git a/mm/zswap.c b/mm/zswap.c +index 61a5c41972db..2674c2806130 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -91,7 +91,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { + module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); + + /* Crypto compressor to use */ +-#define ZSWAP_COMPRESSOR_DEFAULT "lzo" ++#define ZSWAP_COMPRESSOR_DEFAULT "lz4" + static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + static int zswap_compressor_param_set(const char *, + const struct kernel_param *); +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 71f39410691b..288f9679e883 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: Zenify & stuff + + +diff --git a/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt +new file mode 100644 +index 000000000000..a249678a8866 +--- /dev/null ++++ b/Documentation/tp_smapi.txt +@@ -0,0 +1,275 @@ ++tp_smapi version 0.42 ++IBM ThinkPad hardware functions driver ++ ++Author: Shem Multinymous ++Project: http://sourceforge.net/projects/tpctl ++Wiki: http://thinkwiki.org/wiki/tp_smapi ++List: linux-thinkpad@linux-thinkpad.org ++ (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) ++ ++Description ++----------- ++ ++ThinkPad laptops include a proprietary interface called SMAPI BIOS ++(System Management Application Program Interface) which provides some ++hardware control functionality that is not accessible by other means. ++ ++This driver exposes some features of the SMAPI BIOS through a sysfs ++interface. It is suitable for newer models, on which SMAPI is invoked ++through IO port writes. Older models use a different SMAPI interface; ++for those, try the "thinkpad" module from the "tpctl" package. ++ ++WARNING: ++This driver uses undocumented features and direct hardware access. ++It thus cannot be guaranteed to work, and may cause arbitrary damage ++(especially on models it wasn't tested on). ++ ++ ++Module parameters ++----------------- ++ ++thinkpad_ec module: ++ force_io=1 lets thinkpad_ec load on some recent ThinkPad models ++ (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. ++tp_smapi module: ++ debug=1 enables verbose dmesg output. ++ ++ ++Usage ++----- ++ ++Control of battery charging thresholds (in percents of current full charge ++capacity): ++ ++# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh ++# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh ++# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh ++ ++ (This is useful since Li-Ion batteries wear out much faster at very ++ high or low charge levels. The driver will also keeps the thresholds ++ across suspend-to-disk with AC disconnected; this isn't done ++ automatically by the hardware.) ++ ++Inhibiting battery charging for 17 minutes (overrides thresholds): ++ ++# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++# echo 0 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes # stop ++# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++ ++ (This can be used to control which battery is charged when using an ++ Ultrabay battery.) 
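To reapply charge thresholds automatically at boot instead of by hand, a small boot script is one option. A minimal sketch reusing the sysfs paths above, with placeholder values and a hypothetical script name:

# cat /usr/local/bin/battery-thresholds
#!/bin/sh
echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh
echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh

Hook the script into whatever init mechanism is in use; it only repeats the echo commands shown earlier.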
++ ++Forcing battery discharging even if AC power available: ++ ++# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge # start discharge ++# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge # stop discharge ++# cat /sys/devices/platform/smapi/BAT0/force_discharge ++ ++ (When AC is connected, forced discharging will automatically stop ++ when battery is fully depleted -- this is useful for calibration. ++ Also, this attribute can be used to control which battery is discharged ++ when both a system battery and an Ultrabay battery are connected.) ++ ++Misc read-only battery status attributes (see note about HDAPS below): ++ ++/sys/devices/platform/smapi/BAT0/installed # 0 or 1 ++/sys/devices/platform/smapi/BAT0/state # idle/charging/discharging ++/sys/devices/platform/smapi/BAT0/cycle_count # integer counter ++/sys/devices/platform/smapi/BAT0/current_now # instantaneous current ++/sys/devices/platform/smapi/BAT0/current_avg # last minute average ++/sys/devices/platform/smapi/BAT0/power_now # instantaneous power ++/sys/devices/platform/smapi/BAT0/power_avg # last minute average ++/sys/devices/platform/smapi/BAT0/last_full_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/remaining_percent # remaining percent of energy (set by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_percent_error # error range of remaing_percent (not reset by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_running_time # in minutes, by last minute average power ++/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantenous power ++/sys/devices/platform/smapi/BAT0/remaining_charging_time # in minutes ++/sys/devices/platform/smapi/BAT0/remaining_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/design_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/voltage # in mV ++/sys/devices/platform/smapi/BAT0/design_voltage # in mV ++/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current ++/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage ++/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below ++/sys/devices/platform/smapi/BAT0/manufacturer # string ++/sys/devices/platform/smapi/BAT0/model # string ++/sys/devices/platform/smapi/BAT0/barcoding # string ++/sys/devices/platform/smapi/BAT0/chemistry # string ++/sys/devices/platform/smapi/BAT0/serial # integer ++/sys/devices/platform/smapi/BAT0/manufacture_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/first_use_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/temperature # in milli-Celsius ++/sys/devices/platform/smapi/BAT0/dump # see below ++/sys/devices/platform/smapi/ac_connected # 0 or 1 ++ ++The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups ++in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models, ++the battery contains 3 cell groups in series, where each group consisting of 2 ++or 3 cells connected in parallel. The voltage of each group is given by these ++attributes, and their sum (roughly) equals the "voltage" attribute. ++(The effective performance of the battery is determined by the weakest group, ++i.e., the one those voltage changes most rapidly during dis/charging.) ++ ++The "BAT0/dump" attribute gives a a hex dump of the raw status data, which ++contains additional data now in the above (if you can figure it out). Some ++unused values are autodetected and replaced by "--": ++ ++In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g. ++in the UltraBay). 
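Since every attribute above is a plain sysfs file, several can be read in one pass; for instance, with a shell that supports brace expansion:

# grep . /sys/devices/platform/smapi/BAT0/{installed,state,remaining_percent,temperature}

grep prints each result as path:value, which is handy for quick battery health checks or for logging.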
++ ++ ++Raw SMAPI calls: ++ ++/sys/devices/platform/smapi/smapi_request ++This performs raw SMAPI calls. It uses a bad interface that cannot handle ++multiple simultaneous access. Don't touch it, it's for development only. ++If you did touch it, you would so something like ++# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request ++# cat /sys/devices/platform/smapi/smapi_request ++and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd ++value, converted to decimal is 75: the current charge stop threshold. ++ ++ ++Model-specific status ++--------------------- ++ ++Works (at least partially) on the following ThinkPad model: ++* A30 ++* G41 ++* R40, R50p, R51, R52 ++* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60, T61, T400, T410, T420 (partially) ++* X24, X31, X32, X40, X41, X60, X61, X200, X201, X220 (partially) ++* Z60t, Z61m ++ ++Does not work on: ++* X230 and newer ++* T430 and newer ++* Any ThinkPad Edge ++* Any ThinkPad Yoga ++* Any ThinkPad L series ++* Any ThinkPad P series ++ ++Not all functions are available on all models; for detailed status, see: ++ http://thinkwiki.org/wiki/tp_smapi ++ ++Please report success/failure by e-mail or on the Wiki. ++If you get a "not implemented" or "not supported" message, your laptop ++probably just can't do that (at least not via the SMAPI BIOS). ++For negative reports, follow the bug reporting guidelines below. ++If you send me the necessary technical data (i.e., SMAPI function ++interfaces), I will support additional models. ++ ++ ++Additional HDAPS features ++------------------------- ++ ++The modified hdaps driver has several improvements on the one in mainline ++(beyond resolving the conflict with thinkpad_ec and tp_smapi): ++ ++- Fixes reliability and improves support for recent ThinkPad models ++ (especially *60 and newer). Unlike the mainline driver, the modified hdaps ++ correctly follows the Embedded Controller communication protocol. ++ ++- Extends the "invert" parameter to cover all possible axis orientations. ++ The possible values are as follows. ++ Let X,Y denote the hardware readouts. ++ Let R denote the laptop's roll (tilt left/right). ++ Let P denote the laptop's pitch (tilt forward/backward). ++ invert=0: R= X P= Y (same as mainline) ++ invert=1: R=-X P=-Y (same as mainline) ++ invert=2: R=-X P= Y (new) ++ invert=3: R= X P=-Y (new) ++ invert=4: R= Y P= X (new) ++ invert=5: R=-Y P=-X (new) ++ invert=6: R=-Y P= X (new) ++ invert=7: R= Y P=-X (new) ++ It's probably easiest to just try all 8 possibilities and see which yields ++ correct results (e.g., in the hdaps-gl visualisation). ++ ++- Adds a whitelist which automatically sets the correct axis orientation for ++ some models. If the value for your model is wrong or missing, you can override ++ it using the "invert" parameter. Please also update the tables at ++ http://www.thinkwiki.org/wiki/tp_smapi and ++ http://www.thinkwiki.org/wiki/List_of_DMI_IDs ++ and submit a patch for the whitelist in hdaps.c. ++ ++- Provides new attributes: ++ /sys/devices/platform/hdaps/sampling_rate: ++ This determines the frequency at which the host queries the embedded ++ controller for accelerometer data (and informs the hdaps input devices). ++ Default=50. ++ /sys/devices/platform/hdaps/oversampling_ratio: ++ When set to X, the embedded controller is told to do physical accelerometer ++ measurements at a rate that is X times higher than the rate at which ++ the driver reads those measurements (i.e., X*sampling_rate). 
This ++ makes the readouts from the embedded controller more fresh, and is also ++ useful for the running average filter (see next). Default=5 ++ /sys/devices/platform/hdaps/running_avg_filter_order: ++ When set to X, reported readouts will be the average of the last X physical ++ accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a ++ high value decreases readout fluctuations. The averaging is handled by the ++ embedded controller, so no CPU resources are used. Higher values make the ++ readouts smoother, since it averages out both sensor noise (good) and abrupt ++ changes (bad). Default=2. ++ ++- Provides a second input device, which publishes the raw accelerometer ++ measurements (without the fuzzing needed for joystick emulation). This input ++ device can be matched by a udev rule such as the following (all on one line): ++ KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1", ++ ATTRS{modalias}=="input:b0019v1014p5054e4801-*", ++ SYMLINK+="input/hdaps/accelerometer-event ++ ++A new version of the hdapsd userspace daemon, which uses the input device ++interface instead of polling sysfs, is available seprately. Using this reduces ++the total interrupts per second generated by hdaps+hdapsd (on tickless kernels) ++to 50, down from a value that fluctuates between 50 and 100. Set the ++sampling_rate sysfs attribute to a lower value to further reduce interrupts, ++at the expense of response latency. ++ ++Licensing note: all my changes to the HDAPS driver are licensed under the ++GPL version 2 or, at your option and to the extent allowed by derivation from ++prior works, any later version. My version of hdaps is derived work from the ++mainline version, which at the time of writing is available only under ++GPL version 2. ++ ++Bug reporting ++------------- ++ ++Mail . Please include: ++* Details about your model, ++* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with ++ the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1"). ++* Output of "dmidecode | grep -C5 Product" ++* Does the failed functionality works under Windows? ++ ++ ++More about SMAPI ++---------------- ++ ++For hints about what may be possible via the SMAPI BIOS and how, see: ++ ++* IBM Technical Reference Manual for the ThinkPad 770 ++ (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) ++* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x"). ++* drivers/char/mwave/smapi.c in the Linux kernel tree.* ++* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net). ++* The SMAPI_* constants in tp_smapi.c. ++ ++Note that in the above Technical Reference and in the "thinkpad" module, ++SMAPI is invoked through a function call to some physical address. However, ++the interface used by tp_smapi and the above mwave drive, and apparently ++required by newer ThinkPad, is different: you set the parameters up in the ++CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this ++triggers an SMI (System Management Interrupt), causing the CPU to enter ++SMM (System Management Mode) and run the BIOS firmware; the results are ++returned in the CPU's registers. It is not clear what is the relation between ++the two variants of SMAPI, though the assignment of error codes seems to be ++similar. ++ ++In addition, the embedded controller on ThinkPad laptops has a non-standard ++interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip). 
++The interface provides various system management services (currently known: ++battery information and accelerometer readouts). For more information see the ++thinkpad_ec module and the H8S hardware documentation: ++http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..c1e59dc04209 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1244,7 +1244,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + + config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" +- depends on ARC + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives + help + Choosing this option will pass "-O3" to your compiler to optimize +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 4f32c4062fb6..c0bf039e1b40 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c +index 55d33500d55e..744e84228a1f 100644 +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -1338,7 +1338,9 @@ static int set_input_params(struct psmouse *psmouse, + if (psmouse_matches_pnp_id(psmouse, topbuttonpad_pnp_ids) && + !SYN_CAP_EXT_BUTTONS_STICK(info->ext_cap_10)) + __set_bit(INPUT_PROP_TOPBUTTONPAD, dev->propbit); +- } ++ } else if (SYN_CAP_CLICKPAD2BTN(info->ext_cap_0c) || ++ SYN_CAP_CLICKPAD2BTN2(info->ext_cap_0c)) ++ __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); + + return 0; + } +diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h +index fc00e005c611..4cfbeec3ae4c 100644 +--- a/drivers/input/mouse/synaptics.h ++++ b/drivers/input/mouse/synaptics.h +@@ -86,6 +86,7 @@ + */ + #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & BIT(20)) /* 1-button ClickPad */ + #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & BIT(8)) /* 2-button ClickPad */ ++#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & BIT(21)) /* 2-button ClickPad */ + #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & BIT(17)) + #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & BIT(13)) + #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & BIT(19)) +diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig +index 97a420c11eed..c8621e9b2e4a 100644 +--- a/drivers/macintosh/Kconfig ++++ b/drivers/macintosh/Kconfig +@@ -159,6 +159,13 @@ config INPUT_ADBHID + + If unsure, say Y. + ++config ADB_TRACKPAD_ABSOLUTE ++ bool "Enable absolute mode for adb trackpads" ++ depends on INPUT_ADBHID ++ help ++ Enable absolute mode in adb-base trackpads. This feature adds ++ compatibility with synaptics Xorg / Xfree drivers. 
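Whether absolute mode is active can also be checked from userspace: the adbhid changes further down register EV_ABS axes when this option is set, so the trackpad's entry in the input device list should gain a "B: ABS=" capability line:

# cat /proc/bus/input/devices     (look for a "B: ABS=" line on the ADB trackpad entry)

This is only a rough cross-check; the authoritative source is the CONFIG_ADB_TRACKPAD_ABSOLUTE value the kernel was built with.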
++ + config MAC_EMUMOUSEBTN + tristate "Support for mouse button 2+3 emulation" + depends on SYSCTL && INPUT +diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c +index a261892c03b3..a85192de840c 100644 +--- a/drivers/macintosh/adbhid.c ++++ b/drivers/macintosh/adbhid.c +@@ -262,6 +262,15 @@ static struct adb_ids buttons_ids; + #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ + #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ + ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++#define ABS_XMIN 310 ++#define ABS_XMAX 1700 ++#define ABS_YMIN 200 ++#define ABS_YMAX 1000 ++#define ABS_ZMIN 0 ++#define ABS_ZMAX 55 ++#endif ++ + static void + adbhid_keyboard_input(unsigned char *data, int nb, int apoll) + { +@@ -405,6 +414,9 @@ static void + adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + { + int id = (data[0] >> 4) & 0x0f; ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ int btn = 0; int x_axis = 0; int y_axis = 0; int z_axis = 0; ++#endif + + if (!adbhid[id]) { + pr_err("ADB HID on ID %d not yet registered\n", id); +@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + high bits of y-axis motion. XY is additional + high bits of x-axis motion. + ++ For ADB Absolute motion protocol the data array will contain the ++ following values: ++ ++ BITS COMMENTS ++ data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. ++ data[1] = byyy yyyy Left button and y-axis motion. ++ data[2] = bxxx xxxx Second button and x-axis motion. ++ data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. ++ data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion. ++ data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. ++ + MacAlly 2-button mouse protocol. + + For MacAlly 2-button mouse protocol the data array will contain the +@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + switch (adbhid[id]->mouse_kind) + { + case ADBMOUSE_TRACKPAD: ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | ++ ((data[4] & 0x07) << 10); ++ y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | ++ ((data[4] & 0x70) << 6); ++ z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); ++ btn = (!(data[1] >> 7)) & 1; ++#else + data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); + data[2] = data[2] | 0x80; ++#endif + break; + case ADBMOUSE_MICROSPEED: + data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); +@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + break; + } + +- input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); +- input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ if ( adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD ) { + +- if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) +- input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); ++ if(z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1); ++ if(z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0); + +- input_report_rel(adbhid[id]->input, REL_X, +- ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); +- input_report_rel(adbhid[id]->input, REL_Y, +- ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); ++ if(z_axis > 0){ ++ input_report_abs(adbhid[id]->input, ABS_X, x_axis); ++ input_report_abs(adbhid[id]->input, ABS_Y, y_axis); ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5); ++ } else { ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0); ++ } ++ ++ input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); ++ input_report_key(adbhid[id]->input, BTN_LEFT, btn); ++ } else { ++#endif ++ input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); ++ input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++ ++ if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) ++ input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + ++ input_report_rel(adbhid[id]->input, REL_X, ++ ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); ++ input_report_rel(adbhid[id]->input, REL_Y, ++ ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 )); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ } ++#endif + input_sync(adbhid[id]->input); + } + +@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id, + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ set_bit(EV_ABS, input_dev->evbit); ++ input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); ++ set_bit(BTN_TOUCH, input_dev->keybit); ++ set_bit(BTN_TOOL_FINGER, input_dev->keybit); ++ set_bit(ABS_TOOL_WIDTH, input_dev->absbit); ++#endif + break; + + case ADB_MISC: +@@ -1132,7 +1195,11 @@ init_trackpad(int id) + r1_buffer[3], + r1_buffer[4], + r1_buffer[5], ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ 0x00, /* Enable absolute mode */ ++#else + 0x03, /*r1_buffer[6],*/ ++#endif + r1_buffer[7]); + + /* Without this flush, the trackpad may be locked up */ +diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig +index ac4d48830415..b272132ac742 100644 +--- a/drivers/platform/x86/Kconfig ++++ b/drivers/platform/x86/Kconfig +@@ -573,9 +573,28 @@ config THINKPAD_ACPI_HOTKEY_POLL + If you are not sure, say Y here. The driver enables polling only if + it is strictly necessary to do so. + ++config THINKPAD_EC ++ tristate ++ ---help--- ++ This is a low-level driver for accessing the ThinkPad H8S embedded ++ controller over the LPC bus (not to be confused with the ACPI Embedded ++ Controller interface). ++ ++config TP_SMAPI ++ tristate "ThinkPad SMAPI Support" ++ select THINKPAD_EC ++ default n ++ help ++ This adds SMAPI support on Lenovo/IBM ThinkPads, for features such ++ as battery charging control. For more information about this driver ++ see . ++ ++ If you have a Lenovo/IBM ThinkPad laptop, say Y or M here. ++ + config SENSORS_HDAPS + tristate "Thinkpad Hard Drive Active Protection System (hdaps)" + depends on INPUT ++ select THINKPAD_EC + help + This driver provides support for the IBM Hard Drive Active Protection + System (hdaps), which provides an accelerometer and other misc. data. 
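If the config.x86_64 shipped with this PKGBUILD does not already enable the Kconfig entries added above, one way to switch the new drivers on as modules is the kernel's own scripts/config helper, run against the .config in the prepared source tree:

# scripts/config --file .config --module THINKPAD_EC --module TP_SMAPI
# scripts/config --file .config --module SENSORS_HDAPS

TP_SMAPI selects THINKPAD_EC, so the select is resolved automatically the next time the config is regenerated (for example by make olddefconfig), and listing THINKPAD_EC explicitly is optional.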
+diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile +index 2ba6cb795338..399f8b88646f 100644 +--- a/drivers/platform/x86/Makefile ++++ b/drivers/platform/x86/Makefile +@@ -35,6 +35,8 @@ obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o + obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o + obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o + obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o ++obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o ++obj-$(CONFIG_TP_SMAPI) += tp_smapi.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o + obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o + obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o +diff --git a/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c +new file mode 100644 +index 000000000000..597614bc17e6 +--- /dev/null ++++ b/drivers/platform/x86/thinkpad_ec.c +@@ -0,0 +1,513 @@ ++/* ++ * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions ++ * ++ * The embedded controller on ThinkPad laptops has a non-standard interface, ++ * where LPC channel 3 of the H8S EC chip is hooked up to IO ports ++ * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. ++ * The EC LPC interface provides various system management services (currently ++ * known: battery information and accelerometer readouts). This driver ++ * provides access and mutual exclusion for the EC interface. ++* ++ * The LPC protocol and terminology are documented here: ++ * "H8S/2104B Group Hardware Manual", ++ * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf ++ * ++ * Copyright (C) 2006-2007 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++ #include ++#else ++ #include ++#endif ++ ++#define TP_VERSION "0.42" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++/* IO ports used by embedded controller LPC channel 3: */ ++#define TPC_BASE_PORT 0x1600 ++#define TPC_NUM_PORTS 0x20 ++#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ ++#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ ++#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. 
*/ ++ /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 00x%02x", \ ++ msg, args->val[0x0], args->val[0xF], code) ++ ++/* State of request prefetching: */ ++static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ ++static u64 prefetch_jiffies; /* time of prefetch, or: */ ++#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ ++#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ ++ ++/* Locking: */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(thinkpad_ec_mutex); ++#else ++static DEFINE_SEMAPHORE(thinkpad_ec_mutex); ++#endif ++ ++/* Kludge in case the ACPI DSDT reserves the ports we need. */ ++static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ ++static int reserved_io; /* Successfully reserved the ports? */ ++module_param_named(force_io, force_io, bool, 0600); ++MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); ++ ++/** ++ * thinkpad_ec_lock - get lock on the ThinkPad EC ++ * ++ * Get exclusive lock for accesing the ThinkPad embedded controller LPC3 ++ * interface. Returns 0 iff lock acquired. ++ */ ++int thinkpad_ec_lock(void) ++{ ++ int ret; ++ ret = down_interruptible(&thinkpad_ec_mutex); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_lock); ++ ++/** ++ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC ++ * ++ * Try getting an exclusive lock for accesing the ThinkPad embedded ++ * controller LPC3. Returns immediately if lock is not available; neither ++ * blocks nor sleeps. Returns 0 iff lock acquired . ++ */ ++int thinkpad_ec_try_lock(void) ++{ ++ return down_trylock(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); ++ ++/** ++ * thinkpad_ec_unlock - release lock on ThinkPad EC ++ * ++ * Release a previously acquired exclusive lock on the ThinkPad ebmedded ++ * controller LPC3 interface. ++ */ ++void thinkpad_ec_unlock(void) ++{ ++ up(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); ++ ++/** ++ * thinkpad_ec_request_row - tell embedded controller to prepare a row ++ * @args Input register arguments ++ * ++ * Requests a data row by writing to H8S LPC registers TRW0 through TWR15 (or ++ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group ++ * Hardware Manual". Does sanity checks via status register STR3. ++ */ ++static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) ++{ ++ u8 str3; ++ int i; ++ ++ /* EC protocol requires write to TWR0 (function code): */ ++ if (!(args->mask & 0x0001)) { ++ printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); ++ return -EINVAL; ++ } ++ ++ /* Check initial STR3 status: */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) { /* data already pending */ ++ inb(TPC_TWR15_PORT); /* marks end of previous transaction */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC has result from unrequested transaction", ++ str3)); ++ return -EBUSY; /* EC will be ready in a few usecs */ ++ } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC is busy with unrequested transaction", ++ str3)); ++ return -EBUSY; /* data will be pending in a few usecs */ ++ } else if (str3 != 0x00) { /* unexpected status? 
*/ ++ printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR0MW: */ ++ outb(args->val[0], TPC_TWR0_PORT); ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 != H8S_STR3_MWMF) { /* not accepted? */ ++ printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR1 through TWR14: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((args->mask>>i)&1) ++ outb(args->val[i], TPC_TWR0_PORT+i); ++ ++ /* Send TWR15 (default to 0x01). This marks end of command. */ ++ outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); ++ ++ /* Wait until EC starts writing its reply (~60ns on average). ++ * Releasing locks before this happens may cause an EC hang ++ * due to firmware bug! ++ */ ++ for (i = 0; i < TPC_REQUEST_RETRIES; i++) { ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_SWMF) /* EC started replying */ ++ return 0; ++ else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) ++ /* Normal progress (the EC hasn't seen the request ++ * yet, or is processing it). Wait it out. */ ++ ndelay(TPC_REQUEST_NDELAY); ++ else { /* weird EC status */ ++ printk(KERN_WARNING ++ REQ_FMT("bad end STR3", str3)); ++ return -EIO; ++ } ++ } ++ printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); ++ return -EIO; ++} ++ ++/** ++ * thinkpad_ec_read_data - read pre-requested row-data from EC ++ * @args Input register arguments of pre-requested rows ++ * @data Output register values ++ * ++ * Reads current row data from the controller, assuming it's already ++ * requested. Follows the H8S spec for register access and status checks. ++ */ ++static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int i; ++ u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ /* Once we make a request, STR3 assumes the sequence of values listed ++ * in the following 'if' as it reads the request and writes its data. ++ * It takes about a few dozen nanosecs total, with very high variance. ++ */ ++ if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || ++ str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! */ ++ str3 == H8S_STR3_SWMF) ++ return -EBUSY; /* not ready yet */ ++ /* Finally, the EC signals output buffer full: */ ++ if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { ++ printk(KERN_WARNING ++ REQ_FMT("bad initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Read first byte (signals start of read transactions): */ ++ data->val[0] = inb(TPC_TWR0_PORT); ++ /* Optionally read 14 more bytes: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((data->mask >> i)&1) ++ data->val[i] = inb(TPC_TWR0_PORT+i); ++ /* Read last byte from 0x161F (signals end of read transaction): */ ++ data->val[0xF] = inb(TPC_TWR15_PORT); ++ ++ /* Readout still pending? */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) ++ printk(KERN_WARNING ++ REQ_FMT("OBF3B=1 after read", str3)); ++ /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ ++ if (data->val[0xF] == 0x80) ++ printk(KERN_WARNING ++ REQ_FMT("0x161F reports error", data->val[0xF])); ++ return 0; ++} ++ ++/** ++ * thinkpad_ec_is_row_fetched - is the given row currently prefetched? ++ * ++ * To keep things simple we compare only the first and last args; ++ * this suffices for all known cases. 
++ */ ++static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args) ++{ ++ return (prefetch_jiffies != TPC_PREFETCH_NONE) && ++ (prefetch_jiffies != TPC_PREFETCH_JUNK) && ++ (prefetch_arg0 == args->val[0]) && ++ (prefetch_argF == args->val[0xF]) && ++ (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT); ++} ++ ++/** ++ * thinkpad_ec_read_row - request and read data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Read a data row from the ThinkPad embedded controller LPC3 interface. ++ * Does fetching and retrying if needed. The row is specified by an ++ * array of 16 bytes, some of which may be undefined (but the first is ++ * mandatory). These bytes are given in @args->val[], where @args->val[i] is ++ * used iff (@args->mask>>i)&1). The resulting row data is stored in ++ * @data->val[], but is only guaranteed to be valid for indices corresponding ++ * to set bit in @data->mask. That is, if @data->mask&(1<val[i] is undefined. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int retries, ret; ++ ++ if (thinkpad_ec_is_row_fetched(args)) ++ goto read_row; /* already requested */ ++ ++ /* Request the row */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_request_row(args); ++ if (!ret) ++ goto read_row; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ printk(KERN_ERR REQ_FMT("failed requesting row", ret)); ++ goto out; ++ ++read_row: ++ /* Read the row's data */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ goto out; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ ++ printk(KERN_ERR REQ_FMT("failed waiting for data", ret)); ++ ++out: ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_read_row); ++ ++/** ++ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Try reading a data row from the ThinkPad embedded controller LPC3 ++ * interface, if this raw was recently prefetched using ++ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block. ++ * The parameters have the same meaning as in thinkpad_ec_read_row(). ++ * ++ * Returns -EBUSY is data not ready and -ENODATA if row not prefetched. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int ret; ++ if (!thinkpad_ec_is_row_fetched(args)) { ++ ret = -ENODATA; ++ } else { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */ ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row); ++ ++/** ++ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC ++ * @args Input register arguments ++ * ++ * Prefetch a data row from the ThinkPad embedded controller LCP3 ++ * interface. A subsequent call to thinkpad_ec_read_row() with the ++ * same arguments will be faster, and a subsequent call to ++ * thinkpad_ec_try_read_row() stands a good chance of succeeding if ++ * done neither too soon nor too late. See ++ * thinkpad_ec_read_row() for the meaning of @args. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. 
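 *
 * Editor's illustrative sketch (not part of the original patch), using the
 * same battery-status row as thinkpad_ec_test():
 *
 *	const struct thinkpad_ec_row args =
 *		{ .mask = 0x8001, .val = {0x01, [0xF] = 0x00} };
 *	struct thinkpad_ec_row data = { .mask = 0xFFFF };
 *
 *	if (!thinkpad_ec_lock()) {
 *		thinkpad_ec_prefetch_row(&args);
 *		thinkpad_ec_unlock();
 *	}
 *	/+ ...a short while later... +/
 *	if (!thinkpad_ec_lock()) {
 *		if (!thinkpad_ec_try_read_row(&args, &data))
 *			printk(KERN_INFO "battery status byte: 0x%02x\n",
 *			       data.val[1]);
 *		thinkpad_ec_unlock();
 *	}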
++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) ++{ ++ int ret; ++ ret = thinkpad_ec_request_row(args); ++ if (ret) { ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ } else { ++ prefetch_jiffies = get_jiffies_64(); ++ prefetch_arg0 = args->val[0x0]; ++ prefetch_argF = args->val[0xF]; ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); ++ ++/** ++ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data ++ * ++ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the ++ * ThinkPad embedded controller LPC3 interface. ++ * Must be called before unlocking by any code that accesses the controller ++ * ports directly. ++ */ ++void thinkpad_ec_invalidate(void) ++{ ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); ++ ++ ++/*** Checking for EC hardware ***/ ++ ++/** ++ * thinkpad_ec_test - verify the EC is present and follows protocol ++ * ++ * Ensure the EC LPC3 channel really works on this machine by making ++ * an EC request and seeing if the EC follows the documented H8S protocol. ++ * The requested row just reads battery status, so it should be harmless to ++ * access it (on a correct EC). ++ * This test writes to IO ports, so execute only after checking DMI. ++ */ ++static int __init thinkpad_ec_test(void) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = /* battery 0 basic status */ ++ { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; ++ struct thinkpad_ec_row data = { .mask = 0x0000 }; ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ return ret; ++} ++ ++/* Search all DMI device names of a given type for a substring */ ++static int __init dmi_find_substring(int type, const char *substr) ++{ ++ const struct dmi_device *dev = NULL; ++ while ((dev = dmi_find_device(type, NULL, dev))) { ++ if (strstr(dev->name, substr)) ++ return 1; ++ } ++ return 0; ++} ++ ++#define TP_DMI_MATCH(vendor,model) { \ ++ .ident = vendor " " model, \ ++ .matches = { \ ++ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ ++ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ ++ } \ ++} ++ ++/* Check DMI for existence of ThinkPad embedded controller */ ++static int __init check_dmi_for_ec(void) ++{ ++ /* A few old models that have a good EC but don't report it in DMI */ ++ struct dmi_system_id tp_whitelist[] = { ++ TP_DMI_MATCH("IBM", "ThinkPad A30"), ++ TP_DMI_MATCH("IBM", "ThinkPad T23"), ++ TP_DMI_MATCH("IBM", "ThinkPad X24"), ++ TP_DMI_MATCH("LENOVO", "ThinkPad"), ++ { .ident = NULL } ++ }; ++ return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, ++ "IBM ThinkPad Embedded Controller") || ++ dmi_check_system(tp_whitelist); ++} ++ ++/*** Init and cleanup ***/ ++ ++static int __init thinkpad_ec_init(void) ++{ ++ if (!check_dmi_for_ec()) { ++ printk(KERN_WARNING ++ "thinkpad_ec: no ThinkPad embedded controller!\n"); ++ return -ENODEV; ++ } ++ ++ if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { ++ reserved_io = 1; ++ } else { ++ printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... 
", ++ TPC_BASE_PORT, ++ TPC_BASE_PORT + TPC_NUM_PORTS - 1); ++ if (force_io) { ++ printk("forcing use of unreserved IO ports.\n"); ++ } else { ++ printk("consider using force_io=1.\n"); ++ return -ENXIO; ++ } ++ } ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ if (thinkpad_ec_test()) { ++ printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ return -ENXIO; ++ } ++ printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); ++ return 0; ++} ++ ++static void __exit thinkpad_ec_exit(void) ++{ ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ printk(KERN_INFO "thinkpad_ec: unloaded.\n"); ++} ++ ++module_init(thinkpad_ec_init); ++module_exit(thinkpad_ec_exit); +diff --git a/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c +new file mode 100644 +index 000000000000..209cb6487e24 +--- /dev/null ++++ b/drivers/platform/x86/tp_smapi.c +@@ -0,0 +1,1493 @@ ++/* ++ * tp_smapi.c - ThinkPad SMAPI support ++ * ++ * This driver exposes some features of the System Management Application ++ * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on ++ * models in which the SMAPI BIOS runs in SMM and is invoked by writing ++ * to the APM control port 0xB2. ++ * It also exposes battery status information, obtained from the ThinkPad ++ * embedded controller (via the thinkpad_ec module). ++ * Ancient ThinkPad models use a different interface, supported by the ++ * "thinkpad" module from "tpctl". ++ * ++ * Many of the battery status values obtained from the EC simply mirror ++ * values provided by the battery's Smart Battery System (SBS) interface, so ++ * their meaning is defined by the Smart Battery Data Specification (see ++ * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec ++ * are given in the code where relevant. ++ * ++ * Copyright (C) 2006 Shem Multinymous . ++ * SMAPI access code based on the mwave driver by Mike Sullivan. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include /* CMOS defines */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define TP_VERSION "0.42" ++#define TP_DESC "ThinkPad SMAPI Support" ++#define TP_DIR "smapi" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION(TP_DESC); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++static struct platform_device *pdev; ++ ++static int tp_debug; ++module_param_named(debug, tp_debug, int, 0600); ++MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)"); ++ ++/* A few macros for printk()ing: */ ++#define TPRINTK(level, fmt, args...) \ ++ dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args) ++#define DPRINTK(fmt, args...) 
\ ++ do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0) ++ ++/********************************************************************* ++ * SMAPI interface ++ */ ++ ++/* SMAPI functions (register BX when making the SMM call). */ ++#define SMAPI_GET_INHIBIT_CHARGE 0x2114 ++#define SMAPI_SET_INHIBIT_CHARGE 0x2115 ++#define SMAPI_GET_THRESH_START 0x2116 ++#define SMAPI_SET_THRESH_START 0x2117 ++#define SMAPI_GET_FORCE_DISCHARGE 0x2118 ++#define SMAPI_SET_FORCE_DISCHARGE 0x2119 ++#define SMAPI_GET_THRESH_STOP 0x211a ++#define SMAPI_SET_THRESH_STOP 0x211b ++ ++/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at ++ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD */ ++#define SMAPI_RETCODE_EOF 0xff ++static struct { u8 rc; char *msg; int ret; } smapi_retcode[] = ++{ ++ {0x00, "OK", 0}, ++ {0x53, "SMAPI function is not available", -ENXIO}, ++ {0x81, "Invalid parameter", -EINVAL}, ++ {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP}, ++ {0x90, "System error", -EIO}, ++ {0x91, "System is invalid", -EIO}, ++ {0x92, "System is busy, -EBUSY"}, ++ {0xa0, "Device error (disk read error)", -EIO}, ++ {0xa1, "Device is busy", -EBUSY}, ++ {0xa2, "Device is not attached", -ENXIO}, ++ {0xa3, "Device is disbled", -EIO}, ++ {0xa4, "Request parameter is out of range", -EINVAL}, ++ {0xa5, "Request parameter is not accepted", -EINVAL}, ++ {0xa6, "Transient error", -EBUSY}, /* ? */ ++ {SMAPI_RETCODE_EOF, "Unknown error code", -EIO} ++}; ++ ++ ++#define SMAPI_MAX_RETRIES 10 ++#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */ ++static unsigned short smapi_port; /* APM control port, normally 0xB2 */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(smapi_mutex); ++#else ++static DEFINE_SEMAPHORE(smapi_mutex); ++#endif ++ ++/** ++ * find_smapi_port - read SMAPI port from NVRAM ++ */ ++static int __init find_smapi_port(void) ++{ ++ u16 smapi_id = 0; ++ unsigned short port = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&rtc_lock, flags); ++ smapi_id = CMOS_READ(0x7C); ++ smapi_id |= (CMOS_READ(0x7D) << 8); ++ spin_unlock_irqrestore(&rtc_lock, flags); ++ ++ if (smapi_id != 0x5349) { ++ printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id); ++ return -ENXIO; ++ } ++ spin_lock_irqsave(&rtc_lock, flags); ++ port = CMOS_READ(0x7E); ++ port |= (CMOS_READ(0x7F) << 8); ++ spin_unlock_irqrestore(&rtc_lock, flags); ++ if (port == 0) { ++ printk(KERN_ERR "unable to read SMAPI port number\n"); ++ return -ENXIO; ++ } ++ return port; ++} ++ ++/** ++ * smapi_request - make a SMAPI call ++ * @inEBX, @inECX, @inEDI, @inESI: input registers ++ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: outputs registers ++ * @msg: textual error message ++ * Invokes the SMAPI SMBIOS with the given input and outpu args. ++ * All outputs are optional (can be %NULL). ++ * Returns 0 when successful, and a negative errno constant ++ * (see smapi_retcode above) upon failure. ++ */ ++static int smapi_request(u32 inEBX, u32 inECX, ++ u32 inEDI, u32 inESI, ++ u32 *outEBX, u32 *outECX, u32 *outEDX, ++ u32 *outEDI, u32 *outESI, const char **msg) ++{ ++ int ret = 0; ++ int i; ++ int retries; ++ u8 rc; ++ /* Must use local vars for output regs, due to reg pressure. 
*/ ++ u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI; ++ ++ for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) { ++ DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x", ++ inEBX, inECX, inEDI, inESI); ++ ++ /* SMAPI's SMBIOS call and thinkpad_ec end up using use ++ * different interfaces to the same chip, so play it safe. */ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ++ __asm__ __volatile__( ++ "movl $0x00005380,%%eax\n\t" ++ "movl %6,%%ebx\n\t" ++ "movl %7,%%ecx\n\t" ++ "movl %8,%%edi\n\t" ++ "movl %9,%%esi\n\t" ++ "xorl %%edx,%%edx\n\t" ++ "movw %10,%%dx\n\t" ++ "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */ ++ "out %%al,$0x4F\n\t" ++ "movl %%eax,%0\n\t" ++ "movl %%ebx,%1\n\t" ++ "movl %%ecx,%2\n\t" ++ "movl %%edx,%3\n\t" ++ "movl %%edi,%4\n\t" ++ "movl %%esi,%5\n\t" ++ :"=m"(tmpEAX), ++ "=m"(tmpEBX), ++ "=m"(tmpECX), ++ "=m"(tmpEDX), ++ "=m"(tmpEDI), ++ "=m"(tmpESI) ++ :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI), ++ "m"((u16)smapi_port) ++ :"%eax", "%ebx", "%ecx", "%edx", "%edi", ++ "%esi"); ++ ++ thinkpad_ec_invalidate(); ++ thinkpad_ec_unlock(); ++ ++ /* Don't let the next SMAPI access happen too quickly, ++ * may case problems. (We're hold smapi_mutex). */ ++ msleep(50); ++ ++ if (outEBX) *outEBX = tmpEBX; ++ if (outECX) *outECX = tmpECX; ++ if (outEDX) *outEDX = tmpEDX; ++ if (outESI) *outESI = tmpESI; ++ if (outEDI) *outEDI = tmpEDI; ++ ++ /* Look up error code */ ++ rc = (tmpEAX>>8)&0xFF; ++ for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF && ++ smapi_retcode[i].rc != rc; ++i) {} ++ ret = smapi_retcode[i].ret; ++ if (msg) ++ *msg = smapi_retcode[i].msg; ++ ++ DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d", ++ tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)", ++ smapi_retcode[i].msg, inEBX); ++ ++ if (ret != -EBUSY) ++ return ret; ++ } ++ return ret; ++} ++ ++/* Convenience wrapper: discard output arguments */ ++static int smapi_write(u32 inEBX, u32 inECX, ++ u32 inEDI, u32 inESI, const char **msg) ++{ ++ return smapi_request(inEBX, inECX, inEDI, inESI, ++ NULL, NULL, NULL, NULL, NULL, msg); ++} ++ ++ ++/********************************************************************* ++ * Specific SMAPI services ++ * All of these functions return 0 upon success, and a negative errno ++ * constant (see smapi_retcode) on failure. ++ */ ++ ++enum thresh_type { ++ THRESH_STOP = 0, /* the code assumes this is 0 for brevity */ ++ THRESH_START ++}; ++#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop") ++ ++/** ++ * __get_real_thresh - read battery charge start/stop threshold from SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default 1..99, 0=default (pass this as-is to SMAPI) ++ * @outEDI: some additional state that needs to be preserved, meaning unknown ++ * @outESI: some additional state that needs to be preserved, meaning unknown ++ */ ++static int __get_real_thresh(int bat, enum thresh_type which, int *thresh, ++ u32 *outEDI, u32 *outESI) ++{ ++ u32 ebx = (which == THRESH_START) ? 
SMAPI_GET_THRESH_START ++ : SMAPI_GET_THRESH_STOP; ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(ebx, ecx, 0, 0, NULL, ++ &ecx, NULL, outEDI, outESI, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s", ++ THRESH_NAME(which), bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x00000100)) { ++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x", ++ THRESH_NAME(which), bat, ecx); ++ return -EIO; ++ } ++ if (thresh) ++ *thresh = ecx&0xFF; ++ return 0; ++} ++ ++/** ++ * get_real_thresh - read battery charge start/stop threshold from SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default (passes as-is to SMAPI) ++ */ ++static int get_real_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ return __get_real_thresh(bat, which, thresh, NULL, NULL); ++} ++ ++/** ++ * set_real_thresh - write battery start/top charge threshold to SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default (passes as-is to SMAPI) ++ */ ++static int set_real_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ u32 ebx = (which == THRESH_START) ? SMAPI_SET_THRESH_START ++ : SMAPI_SET_THRESH_STOP; ++ u32 ecx = ((bat+1)<<8) + thresh; ++ u32 getDI, getSI; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); ++ if (ret) ++ return ret; ++ ++ ret = smapi_write(ebx, ecx, getDI, getSI, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", ++ THRESH_NAME(which), thresh, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set %s to %d for bat=%d", ++ THRESH_NAME(which), thresh, bat); ++ return ret; ++} ++ ++/** ++ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * @outECX: some additional state that needs to be preserved, meaning unknown ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) ++{ ++ u32 ecx = (bat+1)<<8; ++ u32 esi; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, &esi, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x0100)) { ++ TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); ++ return -EIO; ++ } ++ if (minutes) ++ *minutes = (ecx&0x0001)?esi:0; ++ if (outECX) ++ *outECX = ecx; ++ return 0; ++} ++ ++/** ++ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int get_inhibit_charge_minutes(int bat, int *minutes) ++{ ++ return __get_inhibit_charge_minutes(bat, minutes, NULL); ++} ++ ++/** ++ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ */ ++static int set_inhibit_charge_minutes(int bat, int minutes) ++{ ++ u32 ecx; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); ++ if (ret) ++ return ret; ++ ++ ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 
0x0001 : 0x0000); ++ if (minutes > 0xFFFF) ++ minutes = 0xFFFF; ++ ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, ++ "set to %d failed for bat=%d: %s", minutes, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); ++ return ret; ++} ++ ++ ++/** ++ * get_force_discharge - get status of forced discharging from SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int get_force_discharge(int bat, int *enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; ++ return 0; ++} ++ ++/** ++ * set_force_discharge - write status of forced discharging to SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int set_force_discharge(int bat, int enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (ecx&0x00000100) { ++ TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); ++ return -EIO; ++ } ++ ++ ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); ++ ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", ++ enabled, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); ++ return ret; ++} ++ ++ ++/********************************************************************* ++ * Wrappers to threshold-related SMAPI functions, which handle default ++ * thresholds and related quirks. ++ */ ++ ++/* Minimum, default and minimum difference for battery charging thresholds: */ ++#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ ++#define MIN_THRESH_START 2 ++#define MAX_THRESH_START (100-MIN_THRESH_DELTA) ++#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) ++#define MAX_THRESH_STOP 100 ++#define DEFAULT_THRESH_START MAX_THRESH_START ++#define DEFAULT_THRESH_STOP MAX_THRESH_STOP ++ ++/* The GUI of IBM's Battery Maximizer seems to show a start threshold that ++ * is 1 more than the value we set/get via SMAPI. Since the threshold is ++ * maintained across reboot, this can be confusing. So we kludge our ++ * interface for interoperability: */ ++#define BATMAX_FIX 1 ++ ++/* Get charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. */ ++static int get_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ int ret = get_real_thresh(bat, which, thresh); ++ if (ret) ++ return ret; ++ if (*thresh == 0) ++ *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START ++ : DEFAULT_THRESH_STOP; ++ else if (which == THRESH_START) ++ *thresh += BATMAX_FIX; ++ return 0; ++} ++ ++ ++/* Set charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. 
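 *
 * Editor's worked example (not part of the original patch): writing a stop
 * threshold of 100 stores the SMAPI value 0 ("default", which the hardware
 * treats as 100), while writing a start threshold of 96 stores
 * 96 - BATMAX_FIX = 95, so that IBM's Battery Maximizer GUI reports the
 * same 96 the user asked for.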
*/ ++static int set_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) ++ thresh = 0; /* 100 is out of range, but default means 100 */ ++ if (which == THRESH_START) ++ thresh -= BATMAX_FIX; ++ return set_real_thresh(bat, which, thresh); ++} ++ ++/********************************************************************* ++ * ThinkPad embedded controller readout and basic functions ++ */ ++ ++/** ++ * read_tp_ec_row - read data row from the ThinkPad embedded controller ++ * @arg0: EC command code ++ * @bat: battery number, 0 or 1 ++ * @j: the byte value to be used for "junk" (unused) input/outputs ++ * @dataval: result vector ++ */ ++static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = { .mask = 0xFFFF, ++ .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; ++ struct thinkpad_ec_row data = { .mask = 0xFFFF }; ++ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); ++ return ret; ++} ++ ++/** ++ * power_device_present - check for presence of battery or AC power ++ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power ++ * Returns 1 if present, 0 if not present, negative if error. ++ */ ++static int power_device_present(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u8 test; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (bat) { ++ case 0: test = 0x40; break; /* battery 0 */ ++ case 1: test = 0x20; break; /* battery 1 */ ++ default: test = 0x80; /* AC power */ ++ } ++ return (row[0] & test) ? 1 : 0; ++} ++ ++/** ++ * bat_has_status - check if battery can report detailed status ++ * @bat: 0 for battery 0, 1 for battery 1 ++ * Returns 1 if yes, 0 if no, negative if error. ++ */ ++static int bat_has_status(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ ++ return 0; ++ if ((row[1] & (0x60)) == 0) /* no status */ ++ return 0; ++ return 1; ++} ++ ++/** ++ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data ++ * @arg0: first argument to EC ++ * @off: offset in row returned from EC ++ * @bat: battery (0 or 1) ++ * @val: the 16-bit value obtained ++ * Returns nonzero on error. 
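 *
 * Editor's illustrative example (not part of the original patch), using the
 * arg0/offset pair read by the voltage attribute below (arg0=1, offset=6):
 *
 *	u16 mv;
 *	if (!get_tp_ec_bat_16(1, 6, 0, &mv))
 *		printk(KERN_INFO "battery 0 voltage: %u mV\n", mv);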
++ */ ++static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ *val = *(u16 *)(row+offset); ++ return 0; ++} ++ ++/********************************************************************* ++ * sysfs attributes for batteries - ++ * definitions and helper functions ++ */ ++ ++/* A custom device attribute struct which holds a battery number */ ++struct bat_device_attribute { ++ struct device_attribute dev_attr; ++ int bat; ++}; ++ ++/** ++ * attr_get_bat - get the battery to which the attribute belongs ++ */ ++static int attr_get_bat(struct device_attribute *attr) ++{ ++ return container_of(attr, struct bat_device_attribute, dev_attr)->bat; ++} ++ ++/** ++ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @na_msg: string to output is value not available (0xFFFFFFFF) ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as unsigned, ++ * transformed as x->mul*x, and printed to the buffer. ++ * If the value is 0xFFFFFFFF and na_msg!=%NULL, na_msg is printed instead. ++ */ ++static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, ++ const char *na_msg, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ if (na_msg && val == 0xFFFF) ++ return sprintf(buf, "%s\n", na_msg); ++ else ++ return sprintf(buf, "%u\n", mul*(unsigned int)val); ++} ++ ++/** ++ * show_tp_ec_bat_s16 - show an signed 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @add: correction term to add after multiplication ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as signed, ++ * transformed as x->mul*x+add, and printed to the buffer. 
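 *
 * Editor's worked example (not part of the original patch): the temperature
 * attribute below uses mul=100 and add=-273100 on an SBS reading given in
 * units of 0.1 K, so a raw value of 2981 becomes 2981*100 - 273100 = 25000
 * millidegrees Celsius, i.e. 25.0 degrees C.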
++ */ ++static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", mul*(s16)val+add); ++} ++ ++/** ++ * show_tp_ec_bat_str - show a string from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @maxlen: maximum string length ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, ++ struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ strncpy(buf, (char *)row+offset, maxlen); ++ buf[maxlen] = 0; ++ strcat(buf, "\n"); ++ return strlen(buf); ++} ++ ++/** ++ * show_tp_ec_bat_power - show a power readout from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offV: byte offset of voltage in EC raw data ++ * @offI: byte offset of current in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ * Computes the power as current*voltage from the two given readout offsets. ++ */ ++static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int milliamp, millivolt, ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ millivolt = *(u16 *)(row+offV); ++ milliamp = *(s16 *)(row+offI); ++ return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ ++} ++ ++/** ++ * show_tp_ec_bat_date - decode and show a date from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u16 v; ++ int ret; ++ int day, month, year; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ ++ /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ ++ v = *(u16 *)(row+offset); ++ day = v & 0x1F; ++ month = (v >> 5) & 0xF; ++ year = (v >> 9) + 1980; ++ ++ return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O for batteries - ++ * the actual attribute show/store functions ++ */ ++ ++static ssize_t show_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_START, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++static ssize_t show_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_STOP, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++/** ++ * store_battery_start_charge_thresh - store 
battery_start_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ ++ thresh = MIN_THRESH_START; ++ if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ ++ thresh = MAX_THRESH_START; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_STOP, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { ++ if (ret) /* other threshold is set? */ ++ goto out; ++ ret = get_real_thresh(bat, THRESH_START, NULL); ++ if (ret) /* this threshold is set? */ ++ goto out; ++ if (other_thresh < thresh+MIN_THRESH_DELTA) { ++ /* move other thresh to keep it above this one */ ++ ret = set_thresh(bat, THRESH_STOP, ++ thresh+MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_START, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++ ++} ++ ++/** ++ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ ++ thresh = MIN_THRESH_STOP; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_START, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ ++ if (ret) ++ goto out; ++ /* this threshold exists? 
*/ ++ ret = get_real_thresh(bat, THRESH_STOP, NULL); ++ if (ret) ++ goto out; ++ if (other_thresh >= thresh-MIN_THRESH_DELTA) { ++ /* move other thresh to be below this one */ ++ ret = set_thresh(bat, THRESH_START, ++ thresh-MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_STOP, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++} ++ ++static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int minutes; ++ int bat = attr_get_bat(attr); ++ int ret = get_inhibit_charge_minutes(bat, &minutes); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", minutes); /* units: minutes */ ++} ++ ++static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int ret; ++ int minutes; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { ++ TPRINTK(KERN_ERR, "inhibit_charge_minutes: " ++ "must be a non-negative integer"); ++ return -EINVAL; ++ } ++ ret = set_inhibit_charge_minutes(bat, minutes); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int enabled; ++ int bat = attr_get_bat(attr); ++ int ret = get_force_discharge(bat, &enabled); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", enabled); /* type: boolean */ ++} ++ ++static ssize_t store_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int ret; ++ int enabled; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) ++ return -EINVAL; ++ ret = set_force_discharge(bat, enabled); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_installed( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ int ret = power_device_present(bat); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++static ssize_t show_battery_state( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ const char *txt; ++ int ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return sprintf(buf, "none\n"); ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (row[1] & 0xf0) { ++ case 0xc0: txt = "idle"; break; ++ case 0xd0: txt = "discharging"; break; ++ case 0xe0: txt = "charging"; break; ++ default: return sprintf(buf, "unknown (0x%x)\n", row[1]); ++ } ++ return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ ++} ++ ++static ssize_t show_battery_manufacturer( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34: ManufacturerName() */ ++ return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_model( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. 
SBS spec v1.1 p34: DeviceName() */ ++ return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_barcoding( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string */ ++ return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_chemistry( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ ++ return show_tp_ec_bat_str(6, 2, 5, attr, buf); ++} ++ ++static ssize_t show_battery_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p24: Voltage() */ ++ return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_design_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ ++ return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */ ++ return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group0_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group1_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group2_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group3_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_current_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: Current() */ ++ return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_current_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ ++ return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_current( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ ++ return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_power_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*Current() */ ++ return show_tp_ec_bat_power(1, 6, 8, attr, buf); ++} ++ ++static ssize_t show_battery_power_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */ ++ return show_tp_ec_bat_power(1, 6, 10, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. 
SBS spec v1.1 p25: RelativeStateOfCharge() */ ++ return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent_error( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. SBS spec v1.1 p25: MaxError() */ ++ return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_charging_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ ++ return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26. */ ++ return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_last_full_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ ++ return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_design_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p32: DesignCapacity() */ ++ return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_cycle_count( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ ++ return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_temperature( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ ++ return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); ++} ++ ++static ssize_t show_battery_serial( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: int. SBS spec v1.1 p34: SerialNumber() */ ++ return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_manufacture_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ ++ return show_tp_ec_bat_date(3, 8, attr, buf); ++} ++ ++static ssize_t show_battery_first_use_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD */ ++ return show_tp_ec_bat_date(8, 2, attr, buf); ++} ++ ++/** ++ * show_battery_dump - show the battery's dump attribute ++ * The dump attribute gives a hex dump of all EC readouts related to a ++ * battery. Some of the enumerated values don't really exist (i.e., the ++ * EC function just leaves them untouched); we use a kludge to detect and ++ * denote these. 
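 *
 * Editor's illustrative example (not part of the original patch): one line
 * of the dump might look like
 *
 *	0a 20 03 -- -- 31 10 ...
 *
 * where "--" marks bytes that still equal the junk fill (0xAA on the first
 * read, 0x55 on the second) in both readouts and are therefore assumed to
 * be untouched by the EC.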
++ */ ++#define MIN_DUMP_ARG0 0x00 ++#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */ ++static ssize_t show_battery_dump( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int i; ++ char *p = buf; ++ int bat = attr_get_bat(attr); ++ u8 arg0; /* first argument to EC */ ++ u8 rowa[TP_CONTROLLER_ROW_LEN], ++ rowb[TP_CONTROLLER_ROW_LEN]; ++ const u8 junka = 0xAA, ++ junkb = 0x55; /* junk values for testing changes */ ++ int ret; ++ ++ for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) { ++ if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5) ++ return -ENOMEM; /* don't overflow sysfs buf */ ++ /* Read raw twice with different junk values, ++ * to detect unused output bytes which are left unchaged: */ ++ ret = read_tp_ec_row(arg0, bat, junka, rowa); ++ if (ret) ++ return ret; ++ ret = read_tp_ec_row(arg0, bat, junkb, rowb); ++ if (ret) ++ return ret; ++ for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) { ++ if (rowa[i] == junka && rowb[i] == junkb) ++ p += sprintf(p, "-- "); /* unused by EC */ ++ else ++ p += sprintf(p, "%02x ", rowa[i]); ++ } ++ p += sprintf(p, "\n"); ++ } ++ return p-buf; ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O, other than batteries ++ */ ++ ++static ssize_t show_ac_connected( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int ret = power_device_present(0xFF); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++/********************************************************************* ++ * The the "smapi_request" sysfs attribute executes a raw SMAPI call. ++ * You write to make a request and read to get the result. The state ++ * is saved globally rather than per fd (sysfs limitation), so ++ * simultaenous requests may get each other's results! So this is for ++ * development and debugging only. ++ */ ++#define MAX_SMAPI_ATTR_ANSWER_LEN 128 ++static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = ""; ++ ++static ssize_t show_smapi_request(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer); ++ smapi_attr_answer[0] = '\0'; ++ return ret; ++} ++ ++static ssize_t store_smapi_request(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int inEBX, inECX, inEDI, inESI; ++ u32 outEBX, outECX, outEDX, outEDI, outESI; ++ const char *msg; ++ int ret; ++ if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) { ++ smapi_attr_answer[0] = '\0'; ++ return -EINVAL; ++ } ++ ret = smapi_request( ++ inEBX, inECX, inEDI, inESI, ++ &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg); ++ snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN, ++ "%x %x %x %x %x %d '%s'\n", ++ (unsigned int)outEBX, (unsigned int)outECX, ++ (unsigned int)outEDX, (unsigned int)outEDI, ++ (unsigned int)outESI, ret, msg); ++ if (ret) ++ return ret; ++ else ++ return count; ++} ++ ++/********************************************************************* ++ * Power management: the embedded controller forgets the battery ++ * thresholds when the system is suspended to disk and unplugged from ++ * AC and battery, so we restore it upon resume. 
++ */ ++ ++static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ ++ ++static int tp_suspend(struct platform_device *dev, pm_message_t state) ++{ ++ int restore = (state.event == PM_EVENT_HIBERNATE || ++ state.event == PM_EVENT_FREEZE); ++ if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) ++ saved_threshs[0] = -1; ++ if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) ++ saved_threshs[1] = -1; ++ if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) ++ saved_threshs[2] = -1; ++ if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) ++ saved_threshs[3] = -1; ++ DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ return 0; ++} ++ ++static int tp_resume(struct platform_device *dev) ++{ ++ DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ if (saved_threshs[0] >= 0) ++ set_real_thresh(0, THRESH_STOP , saved_threshs[0]); ++ if (saved_threshs[1] >= 0) ++ set_real_thresh(0, THRESH_START, saved_threshs[1]); ++ if (saved_threshs[2] >= 0) ++ set_real_thresh(1, THRESH_STOP , saved_threshs[2]); ++ if (saved_threshs[3] >= 0) ++ set_real_thresh(1, THRESH_START, saved_threshs[3]); ++ return 0; ++} ++ ++ ++/********************************************************************* ++ * Driver model ++ */ ++ ++static struct platform_driver tp_driver = { ++ .suspend = tp_suspend, ++ .resume = tp_resume, ++ .driver = { ++ .name = "smapi", ++ .owner = THIS_MODULE ++ }, ++}; ++ ++ ++/********************************************************************* ++ * Sysfs device model ++ */ ++ ++/* Attributes in /sys/devices/platform/smapi/ */ ++ ++static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); ++static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, ++ store_smapi_request); ++ ++static struct attribute *tp_root_attributes[] = { ++ &dev_attr_ac_connected.attr, ++ &dev_attr_smapi_request.attr, ++ NULL ++}; ++static struct attribute_group tp_root_attribute_group = { ++ .attrs = tp_root_attributes ++}; ++ ++/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : ++ * Every attribute needs to be defined (i.e., statically allocated) for ++ * each battery, and then referenced in the attribute list of each battery. ++ * We use preprocessor voodoo to avoid duplicating the list of attributes 4 ++ * times. The preprocessor output is just normal sysfs attributes code. 
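 *
 * Editor's illustrative expansion (not part of the original patch): for
 * battery 0 and the read-only "voltage" attribute, the macros below
 * effectively generate
 *
 *	static struct bat_device_attribute dev_attr_voltage_0 = {
 *		.dev_attr = __ATTR(voltage, 0644, show_battery_voltage, 0),
 *		.bat = 0
 *	};
 *
 * and put &dev_attr_voltage_0.dev_attr.attr into tp_bat0_attributes[],
 * which sysfs then exposes as /sys/devices/platform/smapi/BAT0/voltage.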
++ */ ++ ++/** ++ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes ++ * @_BAT: battery number (0 or 1) ++ * @_ATTR_RW: macro to invoke for each read/write attribute ++ * @_ATTR_R: macro to invoke for each read-only attribute ++ */ ++#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \ ++ _ATTR_RW(_BAT, start_charge_thresh) \ ++ _ATTR_RW(_BAT, stop_charge_thresh) \ ++ _ATTR_RW(_BAT, inhibit_charge_minutes) \ ++ _ATTR_RW(_BAT, force_discharge) \ ++ _ATTR_R(_BAT, installed) \ ++ _ATTR_R(_BAT, state) \ ++ _ATTR_R(_BAT, manufacturer) \ ++ _ATTR_R(_BAT, model) \ ++ _ATTR_R(_BAT, barcoding) \ ++ _ATTR_R(_BAT, chemistry) \ ++ _ATTR_R(_BAT, voltage) \ ++ _ATTR_R(_BAT, group0_voltage) \ ++ _ATTR_R(_BAT, group1_voltage) \ ++ _ATTR_R(_BAT, group2_voltage) \ ++ _ATTR_R(_BAT, group3_voltage) \ ++ _ATTR_R(_BAT, current_now) \ ++ _ATTR_R(_BAT, current_avg) \ ++ _ATTR_R(_BAT, charging_max_current) \ ++ _ATTR_R(_BAT, power_now) \ ++ _ATTR_R(_BAT, power_avg) \ ++ _ATTR_R(_BAT, remaining_percent) \ ++ _ATTR_R(_BAT, remaining_percent_error) \ ++ _ATTR_R(_BAT, remaining_charging_time) \ ++ _ATTR_R(_BAT, remaining_running_time) \ ++ _ATTR_R(_BAT, remaining_running_time_now) \ ++ _ATTR_R(_BAT, remaining_capacity) \ ++ _ATTR_R(_BAT, last_full_capacity) \ ++ _ATTR_R(_BAT, design_voltage) \ ++ _ATTR_R(_BAT, charging_max_voltage) \ ++ _ATTR_R(_BAT, design_capacity) \ ++ _ATTR_R(_BAT, cycle_count) \ ++ _ATTR_R(_BAT, temperature) \ ++ _ATTR_R(_BAT, serial) \ ++ _ATTR_R(_BAT, manufacture_date) \ ++ _ATTR_R(_BAT, first_use_date) \ ++ _ATTR_R(_BAT, dump) ++ ++/* Define several macros we will feed into FOREACH_BAT_ATTR: */ ++ ++#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \ ++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ ++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \ ++ store_battery_##_NAME), \ ++ .bat = _BAT \ ++ }; ++ ++#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \ ++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ ++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, 0), \ ++ .bat = _BAT \ ++ }; ++ ++#define REF_BAT_ATTR(_BAT,_NAME) \ ++ &dev_attr_##_NAME##_##_BAT.dev_attr.attr, ++ ++/* This provide all attributes for one battery: */ ++ ++#define PROVIDE_BAT_ATTRS(_BAT) \ ++ FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \ ++ static struct attribute *tp_bat##_BAT##_attributes[] = { \ ++ FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \ ++ NULL \ ++ }; \ ++ static struct attribute_group tp_bat##_BAT##_attribute_group = { \ ++ .name = "BAT" #_BAT, \ ++ .attrs = tp_bat##_BAT##_attributes \ ++ }; ++ ++/* Finally genereate the attributes: */ ++ ++PROVIDE_BAT_ATTRS(0) ++PROVIDE_BAT_ATTRS(1) ++ ++/* List of attribute groups */ ++ ++static struct attribute_group *attr_groups[] = { ++ &tp_root_attribute_group, ++ &tp_bat0_attribute_group, ++ &tp_bat1_attribute_group, ++ NULL ++}; ++ ++ ++/********************************************************************* ++ * Init and cleanup ++ */ ++ ++static struct attribute_group **next_attr_group; /* next to register */ ++ ++static int __init tp_init(void) ++{ ++ int ret; ++ printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n"); ++ ++ ret = find_smapi_port(); ++ if (ret < 0) ++ goto err; ++ else ++ smapi_port = ret; ++ ++ if (!request_region(smapi_port, 1, "smapi")) { ++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", ++ smapi_port); ++ ret = -ENXIO; ++ goto err; ++ } ++ ++ if (!request_region(SMAPI_PORT2, 1, "smapi")) { ++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", ++ SMAPI_PORT2); 
++ ret = -ENXIO; ++ goto err_port1; ++ } ++ ++ ret = platform_driver_register(&tp_driver); ++ if (ret) ++ goto err_port2; ++ ++ pdev = platform_device_alloc("smapi", -1); ++ if (!pdev) { ++ ret = -ENOMEM; ++ goto err_driver; ++ } ++ ++ ret = platform_device_add(pdev); ++ if (ret) ++ goto err_device_free; ++ ++ for (next_attr_group = attr_groups; *next_attr_group; ++ ++next_attr_group) { ++ ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); ++ if (ret) ++ goto err_attr; ++ } ++ ++ printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", ++ smapi_port); ++ return 0; ++ ++err_attr: ++ while (--next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++err_device_free: ++ platform_device_put(pdev); ++err_driver: ++ platform_driver_unregister(&tp_driver); ++err_port2: ++ release_region(SMAPI_PORT2, 1); ++err_port1: ++ release_region(smapi_port, 1); ++err: ++ printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); ++ return ret; ++} ++ ++static void __exit tp_exit(void) ++{ ++ while (next_attr_group && --next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++ platform_driver_unregister(&tp_driver); ++ release_region(SMAPI_PORT2, 1); ++ if (smapi_port) ++ release_region(smapi_port, 1); ++ ++ printk(KERN_INFO "tp_smapi unloaded.\n"); ++} ++ ++module_init(tp_init); ++module_exit(tp_exit); +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ ---help--- ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. 
++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/fs/exec.c b/fs/exec.c +index 65eaacaba4f4..1d3b310bd5f0 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -63,6 +63,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -866,9 +868,12 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) + if (err) + goto exit; + +- if (name->name[0] != '\0') ++ if (name->name[0] != '\0') { + fsnotify_open(file); + ++ trace_open_exec(name->name); ++ } ++ + out: + return file; + +diff --git a/fs/open.c b/fs/open.c +index cb81623a8b09..a92b0f6061ac 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -34,6 +34,9 @@ + + #include "internal.h" + ++#define CREATE_TRACE_POINTS ++#include ++ + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) + { +@@ -1068,6 +1071,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) + } else { + fsnotify_open(f); + fd_install(fd, f); ++ trace_do_sys_open(tmp->name, flags, mode); + } + } + putname(tmp); +diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h +new file mode 100644 +index 000000000000..fb634b74adf3 +--- /dev/null ++++ b/include/trace/events/fs.h +@@ -0,0 +1,53 @@ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM fs ++ ++#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_FS_H ++ ++#include ++#include ++ ++TRACE_EVENT(do_sys_open, ++ ++ TP_PROTO(const char *filename, int flags, int mode), ++ ++ TP_ARGS(filename, flags, mode), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ __field( int, flags ) ++ __field( int, mode ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ __entry->flags = flags; ++ __entry->mode = mode; ++ ), ++ ++ TP_printk("\"%s\" %x %o", ++ __get_str(filename), __entry->flags, __entry->mode) ++); ++ ++TRACE_EVENT(open_exec, ++ ++ TP_PROTO(const char *filename), ++ ++ TP_ARGS(filename), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ ), ++ ++ TP_printk("\"%s\"", ++ __get_str(filename)) ++); ++ ++#endif /* _TRACE_FS_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 79226ca8f80f..2a30060e7e1d 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -47,7 +47,11 @@ struct blk_queue_stats; + struct blk_stat_callback; + + #define BLKDEV_MIN_RQ 4 ++#ifdef CONFIG_ZENIFY ++#define BLKDEV_MAX_RQ 512 ++#else + #define BLKDEV_MAX_RQ 128 /* Default maximum */ ++#endif + + /* Must be consistent with blk_mq_poll_stats_bkt() */ + #define BLK_MQ_POLL_STATS_BKTS 16 +diff --git a/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h +new file mode 100644 +index 000000000000..1b80d7ee5493 +--- /dev/null ++++ b/include/linux/thinkpad_ec.h +@@ -0,0 +1,47 @@ ++/* ++ * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions ++ * ++ * Copyright (C) 2005 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _THINKPAD_EC_H ++#define _THINKPAD_EC_H ++ ++#ifdef __KERNEL__ ++ ++#define TP_CONTROLLER_ROW_LEN 16 ++ ++/* EC transactions input and output (possibly partial) vectors of 16 bytes. */ ++struct thinkpad_ec_row { ++ u16 mask; /* bitmap of which entries of val[] are meaningful */ ++ u8 val[TP_CONTROLLER_ROW_LEN]; ++}; ++ ++extern int __must_check thinkpad_ec_lock(void); ++extern int __must_check thinkpad_ec_try_lock(void); ++extern void thinkpad_ec_unlock(void); ++ ++extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data); ++extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *mask); ++extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); ++extern void thinkpad_ec_invalidate(void); ++ ++ ++#endif /* __KERNEL */ ++#endif /* _THINKPAD_EC_H */ +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). + */ + #define MIN_NR_CONSOLES 1 /* must be at least 1 */ +-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ ++/* ++ * NR_TTY_DEVICES: ++ * Value MUST be at least 12 and must never be higher then 63 ++ */ ++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ + /* Note: the ioctl VT_GETSTATE does not work for + consoles 16 and higher (since it returns a short) */ + +diff --git a/init/Kconfig b/init/Kconfig +index 041f3a022122..5ed70eb1ad3a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. 
++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f0a0be4d344..bada807c7e59 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + #ifdef CONFIG_SMP + /* +@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + /* + * The margin used when comparing utilization with CPU capacity: +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 337c6afb3345..9315e358f292 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 80dad301361d..42b7fa7d01f8 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -702,6 +702,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO + +From: Nick Desaulniers +Date: Mon, 24 Dec 2018 13:37:41 +0200 +Subject: include/linux/compiler*.h: define asm_volatile_goto + +asm_volatile_goto should also be defined for other compilers that +support asm goto. + +Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h +mutually exclusive"). + +Signed-off-by: Nick Desaulniers +Signed-off-by: Miguel Ojeda + +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +index ba814f1..e77eeb0 100644 +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -188,6 +188,10 @@ struct ftrace_likely_data { + #define asm_volatile_goto(x...) asm goto(x) + #endif + ++#ifndef asm_volatile_goto ++#define asm_volatile_goto(x...) asm goto(x) ++#endif ++ + /* Are two types/vars the same type (ignoring qualifiers)? 
*/ + #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) + +From: Andy Lavr +Date: Mon, 24 Dec 2018 14:57:47 +0200 +Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: +https://lwn.net/Articles/711248/ + +Signed-off-by: Andy Lavr + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index e84a10b..21d62b7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 076ba7308e65..81f89095aa77 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index f3ea78b0c91c..4dbacc6b073b 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -621,7 +621,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 14:32:50 -0300 +Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead + pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index a2adf95b3f9c..e804d9f7583a 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); + void task_dirty_inc(struct task_struct *tsk); + + /* readahead.c */ +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read); +From de7119e3db9fdb4c704355854a02a7e9fad931d4 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: [PATCH] ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
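As a usage sketch of the parameter this commit adds (the surrounding boot entry is hypothetical), a user running a kernel that carries "intel_pstate=disable" in its built-in command line would append the new value to their bootloader's kernel line so the driver is allowed to load again:

    intel_pstate=enable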
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e03..0b613370d28d8 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1765,6 +1765,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index d2fa3e9ccd97c..bd10cb02fc0ff 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -2826,6 +2826,8 @@ static int __init intel_pstate_setup(char *str) + pr_info("HWP disabled\n"); + no_hwp = 1; + } ++ if (!strcmp(str, "enable")) ++ no_load = 0; + if (!strcmp(str, "force")) + force_load = 1; + if (!strcmp(str, "hwp_only")) diff --git a/linux55-tkg/linux55-tkg-patches/0003-glitched-cfs.patch b/linux55-tkg/linux55-tkg-patches/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
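For a quick sense of the trade-off in these help texts, the tick period is simply 1000/HZ milliseconds: 100 Hz fires every 10 ms, 250 Hz every 4 ms, 300 Hz roughly every 3.3 ms, 500 Hz every 2 ms, 750 Hz roughly every 1.3 ms, and 1000 Hz every 1 ms. The two added choices therefore slot between the stock 300 Hz option and the 1000 Hz setting, trading a little extra interrupt overhead for finer-grained scheduling.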
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux55-tkg/linux55-tkg-patches/0004-5.5-ck1.patch b/linux55-tkg/linux55-tkg-patches/0004-5.5-ck1.patch new file mode 100644 index 0000000..f10386d --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0004-5.5-ck1.patch @@ -0,0 +1,12598 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e0..e219f958f055 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4307,6 +4307,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index def074807cee..7f00e3a1276b 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -46,6 +46,7 @@ show up in /proc/sys/kernel: + - hung_task_check_interval_secs + - hung_task_warnings + - hyperv_record_panic_msg ++- iso_cpu + - kexec_load_disabled + - kptr_restrict + - l2cr [ PPC only ] +@@ -82,6 +83,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +107,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -438,6 +441,16 @@ When kptr_restrict is set to (2), kernel pointers printed using + %pK will be replaced with 0's regardless of privileges. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + l2cr: (PPC only) + ================ + +@@ -907,6 +920,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? + + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + rtsig-max & rtsig-nr: + ===================== + +@@ -1173,3 +1200,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. 
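As an illustrative userspace sketch (not part of the patchset; the procfs paths are the ones documented above and the value 4 is an arbitrary example), the MuQSS tunables can be adjusted at runtime by writing to /proc/sys/kernel:

#include <stdio.h>

/* Write a value to a sysctl file under procfs; returns non-zero on failure. */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Shorter round robin interval: lower latency, less throughput. */
	if (write_sysctl("/proc/sys/kernel/rr_interval", "4"))
		perror("rr_interval");

	/* Keep the SCHED_ISO budget at its documented 70% default. */
	if (write_sysctl("/proc/sys/kernel/iso_cpu", "70"))
		perror("iso_cpu");

	return 0;
}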
++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. 
Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. 
There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. 
BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. 
++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. 
How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. 
BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued proecesses and led to ++cache thrashing while iterating over the linked list. 
++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). 
Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examine lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. 
++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regarless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and Numa aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. 
Thus the number of knobs and features has been kept to an absolute
++minimum and should not require extra user input for the vast majority of cases.
++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
++does _not_ now feature is support for CGROUPS. The average user should neither
++need to know what these are, nor should they need to be using them to have good
++desktop behaviour. However since some applications refuse to work without
++cgroups, one can enable them with MuQSS as a stub and the filesystem will be
++created which will allow the applications to work.
++
++rr_interval:
++
++ /proc/sys/kernel/rr_interval
++
++The value is in milliseconds, and the default value is set to 6. Valid values
++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
++decreasing throughput, while increasing it will improve throughput, but at the
++cost of worsening latencies. It is based on the fact that humans can detect
++jitter at approximately 7ms, so aiming for much lower latencies is pointless
++under most circumstances. It is worth noting this fact when comparing the
++latency performance of MuQSS to other schedulers. Worst case latencies being
++higher than 7ms are far worse than average latencies not being in the
++microsecond range.
++
++interactive:
++
++ /proc/sys/kernel/interactive
++
++The value is a simple boolean of 1 for on and 0 for off and is set to on by
++default. Disabling this will disable the near-determinism of MuQSS when
++selecting the next task by not examining all CPUs for the earliest deadline
++task, or which CPU to wake to, instead prioritising CPU balancing for improved
++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
++instead of across the whole system.
++
++Runqueue sharing.
++
++By default MuQSS chooses to share runqueue resources (specifically the skip
++list and locking) between multicore siblings. It is configurable at build time
++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
++only between simultaneous multithreading siblings, multicore siblings, or
++symmetric multiprocessing physical packages. Additionally it can be set at
++boot time with the use of the rqshare parameter. The reason for configurability
++is that some architectures have CPUs with many multicore siblings (>= 16)
++where it may be detrimental to throughput to share runqueues and another
++sharing option may be desirable. Additionally, more sharing than usual can
++improve latency on a system-wide level at the expense of throughput if desired.
++
++The options are:
++none, smt, mc, smp
++
++eg:
++ rqshare=mc
++
++Isochronous scheduling:
++
++Isochronous scheduling is a unique scheduling policy designed to provide
++near-real-time performance to unprivileged (ie non-root) users without the
++ability to starve the machine indefinitely. Isochronous tasks (which means
++"same time") are set using, for example, the schedtool application like so:
++
++ schedtool -I -e amarok
++
++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
++is that it has a priority level between true realtime tasks and SCHED_NORMAL
++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
++rate).
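++
++The same policy can also be requested directly with sched_setscheduler(2)
++rather than via schedtool. A minimal sketch, assuming the SCHED_ISO value (4)
++that this patch set adds to the uapi sched.h header; on a kernel without
++SCHED_ISO support the call simply fails with EINVAL:
++
++  #include <stdio.h>
++  #include <sched.h>
++
++  #ifndef SCHED_ISO
++  #define SCHED_ISO 4   /* value used by this patch set's uapi header */
++  #endif
++
++  int main(void)
++  {
++      struct sched_param sp = { .sched_priority = 0 };
++
++      /* Unprivileged callers are allowed to request SCHED_ISO. */
++      if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
++          perror("sched_setscheduler(SCHED_ISO)");
++          return 1;
++      }
++      printf("running with policy %d (SCHED_ISO)\n", sched_getscheduler(0));
++      /* ... start the latency-sensitive work here ... */
++      return 0;
++  }
++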
However if ISO tasks run for more than a tunable finite amount of time,
++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
++time is the percentage of CPU available per CPU, configurable as a percentage in
++the following "resource handling" tunable (as opposed to a scheduler tunable):
++
++iso_cpu:
++
++ /proc/sys/kernel/iso_cpu
++
++and is set to 70% by default. It is calculated over a rolling 5 second average.
++Because it is the total CPU available, it means that on a multi CPU machine, it
++is possible to have an ISO task running as realtime scheduling indefinitely on
++just one CPU, as the other CPUs will be available. Setting this to 100 is the
++equivalent of giving all users SCHED_RR access and setting it to 0 removes the
++ability to run any pseudo-realtime tasks.
++
++A feature of MuQSS is that it detects when an application tries to obtain a
++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
++appropriate privileges to use those policies. When it detects this, it will
++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++
++
++Idleprio scheduling:
++
++Idleprio scheduling is a scheduling policy designed to give out CPU to a task
++_only_ when the CPU would be otherwise idle. The idea behind this is to allow
++ultra low priority tasks to be run in the background that have virtually no
++effect on the foreground tasks. This is ideally suited to distributed computing
++clients (like setiathome, folding, mprime etc) but can also be used to start a
++video encode or so on without any slowdown of other tasks. To prevent this policy
++from grabbing shared resources and holding them indefinitely, if it detects a
++state where the task is waiting on I/O, the machine is about to suspend to ram
++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
++superuser privileges since it is effectively a lower scheduling policy. Tasks
++can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
++
++schedtool -D -e ./mprime
++
++Subtick accounting:
++
++It is surprisingly difficult to get accurate CPU accounting, and in many cases,
++the accounting is done by simply determining what is happening at the precise
++moment a timer tick fires off. This becomes increasingly inaccurate as the timer
++tick frequency (HZ) is lowered. It is possible to create an application which
++uses almost 100% CPU, yet by being descheduled at the right time, records zero
++CPU usage. While the main problem with this is that there are possible security
++implications, it is also difficult to determine how much CPU a task really does
++use. MuQSS uses sub-tick accounting from the TSC clock to determine real CPU
++usage. Thus, the amount of CPU reported as being used by MuQSS will more
++accurately represent how much CPU the task itself is using (as is shown for
++example by the 'time' application), so the reported values may be quite
++different to other schedulers. When comparing throughput of MuQSS to other
++designs, it is important to compare the actual completed work in terms of total
++wall clock time taken and total work done, rather than the reported "cpu usage".
++
++Symmetric MultiThreading (SMT) aware nice:
++
++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs.
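++
++Before looking at SMT in more detail: all three tunables documented above live
++under /proc/sys/kernel and can be read back programmatically as well. A
++minimal, illustrative sketch (writing new values requires root):
++
++  #include <stdio.h>
++
++  /* Paths as documented above; returns -1 if the file cannot be read. */
++  static long read_tunable(const char *path)
++  {
++      FILE *f = fopen(path, "r");
++      long val = -1;
++
++      if (f) {
++          if (fscanf(f, "%ld", &val) != 1)
++              val = -1;
++          fclose(f);
++      }
++      return val;
++  }
++
++  int main(void)
++  {
++      printf("rr_interval = %ld ms\n",
++             read_tunable("/proc/sys/kernel/rr_interval"));
++      printf("interactive = %ld\n",
++             read_tunable("/proc/sys/kernel/interactive"));
++      printf("iso_cpu     = %ld%%\n",
++             read_tunable("/proc/sys/kernel/iso_cpu"));
++      return 0;
++  }
++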
While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/Makefile b/Makefile +index 6a01b073915e..dd1876725cef 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus + PHONY := _all + _all: + ++CKVERSION = -ck1 ++CKNAME = MuQSS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. + # +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index ef179033a7c2..14b576a531ad 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -665,6 +665,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index a12656ec0072..b46b6ddc7636 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 96dab76da3b3..78611fa96178 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1237,6 +1237,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 519ff58e67b3..b2a05b6f7d80 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 3608e55eaecd..614a0a30cc70 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 1d923dbb9928..9c1931f1fafd 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 2773899c21b3..870866aaa39d 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT_VOLUNTARY=n + CONFIG_TASKSTATS=y + CONFIG_TASK_DELAY_ACCT=y + CONFIG_TASK_XACCT=y +@@ -27,6 +27,11 @@ CONFIG_MODVERSIONS=y + CONFIG_BLK_DEV_INTEGRITY=y + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set ++# CONFIG_ARCH_MULTI_V7 is not set ++CONFIG_ARCH_MXS=y ++# CONFIG_ARM_THUMB is not set ++CONFIG_PREEMPT=y ++CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y + CONFIG_UNIX=y +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index e688dfad0b72..000a2e0d01ea 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -911,6 +911,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 1788ae23bff9..8daf4e1eebcd 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 9085f4d6c698..fb23111d45f6 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 21a1168ae301..529a1b1007cf 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index 0921ef38e9fb..6da05cef46f8 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,5 +1,5 @@ + CONFIG_SYSVIPC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 328d4dfeb4cb..e17cb23173ea 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index 914af125a7fa..76a64290373f 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 4ecb157e56d4..ea7309283b01 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -1,10 +1,10 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASKSTATS=y +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 63fe2da1b37f..7f08ee237345 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git 
a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index 24e07180c57d..38582e8f71c4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=m + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=18 +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index 738ba3b1374b..6a3267e8aa0d 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_EXPERT=y + CONFIG_SLAB=y +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 2c7adea7638f..1c82d62bee72 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +index d3e3d94e90c3..578524f80cc4 100644 +--- a/arch/parisc/configs/712_defconfig ++++ b/arch/parisc/configs/712_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_GSC_LASI=y + # CONFIG_PDC_CHASSIS is not set + CONFIG_BINFMT_MISC=m +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +index 64d45a8b6ca0..d1bdfad94048 100644 +--- a/arch/parisc/configs/c3000_defconfig ++++ b/arch/parisc/configs/c3000_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA8X00=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_GSC is not set + CONFIG_PCI=y + CONFIG_PCI_LBA=y +diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig +index 5b877ca34ebf..0d976614934c 100644 +--- a/arch/parisc/configs/defconfig ++++ b/arch/parisc/configs/defconfig +@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IOMMU_CCIO=y + CONFIG_GSC_LASI=y + CONFIG_GSC_WAX=y +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index e2a412113359..a8fa2e3c05a6 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -878,6 +878,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 7e28919041cf..b22d0a7c2751 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 9a527f978106..5895f2cc726e 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 3b0e1eb6e874..e296a2cd9903 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index 4ec961ace688..a03a1ad670a0 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 6c325d53a20a..98d4ef3d76cf 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 5e8949953660..4f6afc56ff5f 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1004,6 +1004,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' 
levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1034,6 +1050,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +@@ -1421,7 +1439,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1441,17 +1459,17 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 59ce9ed58430..f19741b0f43d 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -29,7 +29,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 0b9654c7a05c..eb361bdf6026 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -27,7 +27,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_SMP=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 4c297f69171d..5bc4f1be2617 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index cad9563f8f48..d7f93c85c2cb 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ 
b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3537,7 +3537,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 22c6a2e61236..c4bccd444cbf 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1289,7 +1289,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. */ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->watch_timer); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index e5252ef3812f..6ae6241185ea 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -237,7 +237,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index 75f3efee21a4..09b1932ce85b 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 267eac00a3fb..352af68c6cd7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index d8c40a83097d..8332baf4961c 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? 
+- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 39530d43590e..a7caf2eb5771 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index cf7cfda94107..f63e17489547 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index 856e7ab7f33e..766a26251337 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 137853944e46..76830892f373 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c 
b/drivers/media/pci/ivtv/ivtv-streams.c +index f7de9118f609..f39ad2952c0f 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index cb0437b4c331..163fffc0e1d4 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index fb9de7bbcd19..e53cf45e7f3f 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index b0303cf00387..0925b5065147 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index b690796d24d4..448b13da62b4 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 8e6607fc8a67..b9ab770bbdb5 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index bbb2575d4728..637757144221 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index d2539c95adb6..0c2f31a03ce9 100644 +--- 
a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index c2a58f05b9a1..6ef94ef9b18e 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2678,7 +2678,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index 9ce6d30576dd..e53d42786ac9 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index c4c83ab60cbc..2be961a750b8 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... */ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 90fb73575495..c94048b048a5 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. 
*/ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index bffe548187ee..c2918ee3e100 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -798,7 +798,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -974,7 +974,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. + */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 2018614f258f..fc19b312c345 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index b60795893994..d2d05691dbd2 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], 
flags); + +@@ -2277,7 +2277,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index b138d9fee675..63cacc21aa58 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5181,7 +5181,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index b3650c989ed4..7ed1fb285754 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index f98e3ae27bff..0741c8352a6d 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4742,7 +4742,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index cb95ad6fa4f9..d352058e0336 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +index c94328a5bd4a..6e7d4671aa69 100644 +--- a/drivers/staging/speakup/speakup_acntpc.c ++++ b/drivers/staging/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- 
a/drivers/staging/speakup/speakup_apollo.c ++++ b/drivers/staging/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +index ddbb7e97d118..f9502addc765 100644 +--- a/drivers/staging/speakup/speakup_decext.c ++++ b/drivers/staging/speakup/speakup_decext.c +@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 798c42dfa16c..d85b41db67a3 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index dccb4ea29d37..8ecead307d04 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 414827e888fc..cb31c9176daa 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + 
set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 3568bfb89912..0a80b3b098b2 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 1d1440d43002..52fe89ae1d9d 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index cfe63932f825..71c00ef772a3 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -913,7 +913,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c 
b/drivers/video/fbdev/pxafb.c +index f70c9f79622e..0b363eaee24f 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 37345fb6191d..3874c17d1bc5 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -91,7 +91,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..73417df5daa2 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 716ad1d8d95e..f5fe682bed8a 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -214,13 +217,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long 
schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + asmlinkage void preempt_schedule_irq(void); +@@ -649,9 +679,11 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -676,10 +708,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -844,6 +891,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1298,6 +1349,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct 
task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 1abe91ff6e4a..20ba383562b0 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index f1879884238e..2326bea2d6e7 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -102,7 +102,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h 
b/include/uapi/linux/sched.h +index 4a0217832464..2a7f7f3695f7 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -104,9 +104,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index a34064a031a5..dad51df6237f 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,18 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -786,6 +798,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -885,9 +898,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1016,6 +1033,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
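(Illustrative aside, not taken from the patch set itself.) The include/uapi/linux/sched.h hunk earlier in this file numbers SCHED_ISO as policy 4 on MuQSS kernels, while libc headers do not define it. A minimal user-space sketch of how a tool might opt a process into SCHED_ISO could look like the following; the policy value is declared locally on the assumption that it matches that hunk, and on a non-MuQSS kernel the call is expected to fail with EINVAL.

    /* Sketch only: assumes a MuQSS kernel where scheduling policy 4 is SCHED_ISO. */
    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>

    #ifndef SCHED_ISO
    #define SCHED_ISO 4     /* value from the uapi hunk above; not exported by libc */
    #endif

    int main(void)
    {
            /* ISO is selected by policy, not by an RT priority, so priority stays 0. */
            struct sched_param sp = { .sched_priority = 0 };

            if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
                    fprintf(stderr, "SCHED_ISO not available: %s\n", strerror(errno));
                    return 1;
            }
            printf("running with policy %d (SCHED_ISO)\n", sched_getscheduler(0));
            return 0;
    }

On a kernel built with CONFIG_SCHED_MUQSS this places the task in the pseudo-realtime ISO class, subject to the sched_iso_cpu percentage limit described further down in MuQSS.c; elsewhere the request is simply rejected.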
+@@ -1134,6 +1152,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..5c2bcbf25add 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +86,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +94,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index da1bc0b60a7d..2b323af46d7c 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1125,6 +1125,8 @@ static int __ref kernel_init(void *unused) + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. 
++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. ++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..89ed751ac4e4 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -20,11 +21,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -32,7 +40,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -40,7 +51,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. 
+ +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -51,9 +62,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index bf82259cff96..d9438eb6f91c 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -2,7 +2,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -18,7 +18,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +diff --git a/kernel/Makefile b/kernel/Makefile +index f2cc0d118a0b..7c18c3eddd9b 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o ++ async.o range.o smpboot.o ucount.o skip_list.o + + obj-$(CONFIG_MODULES) += kmod.o + obj-$(CONFIG_MULTIUSER) += groups.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 2833ffb0c211..37a1f8d73eee 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index f92d9a687372..d17db0ff775f 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -111,6 +111,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and 
requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. ++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 1753486b440c..f43423737493 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -24,9 +24,20 @@ + #include "internals.h" + + #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index b262f47046ca..9797ad652268 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -433,6 +433,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -454,7 +482,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..825f9b8e228f 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. 
+ */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..a04ffebc6b7a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..b8b35546c416 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7607 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.198 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. 
++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((&paravirt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling.
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. 
++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. 
++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = NS_TO_US(delta) / us_interval; ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. 
++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. 
*/ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. 
This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. 
++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. 
++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. 
++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++#endif ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. 
++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. 
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. 
Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. 
++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ ++ do_pending_softirq(rq, current); ++ ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). 
If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). 
++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. 
++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. 
++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ * ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. 
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ curr = rq->curr; ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. 
*/ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. 
++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. ++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. 
++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. 
++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ do_pending_softirq(rq, next); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. 
++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). 
We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq, NULL); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. 
++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. 
Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. 
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct sched_domain *sd; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { ++ cpu = i; ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. 
*/ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ struct cpumask old_mask; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, p->cpus_ptr); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. 
++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header 
*sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++ return per_cpu(cpu_llc_shared_map, cpu); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct rq *rq, *other_rq, *leader = cpu_rq(0); ++ struct sched_domain *sd; ++ int cpu, other_cpu, i; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for (cpu = 0; cpu < num_online_cpus(); cpu++) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ 
local_irq_enable(); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (rq->mc_leader == rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (rq->smt_leader == rq) && ++#endif ++ (rq->smp_leader == rq)) { ++ total_runqueues++; ++ } ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int selected_cpus[NR_CPUS], selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus[NR_CPUS], ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (other_rq->mc_leader == other_rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (other_rq->smt_leader == other_rq) && ++#endif ++ (other_rq->smp_leader == other_rq)) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ 
total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->smp_leader = rq; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = rq; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = rq; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. 
++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR 
++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..cf655482df00 +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1012 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. 
++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. 
Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * 
++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. 
++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_util_with(struct rq __maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) ++{ ++ return util; ++} ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. ++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 9b8916fd00a2..5769f533f6d2 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index 7dc20a3232e7..e733a0a53b0a 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,9 +17,11 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); + void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index d43318a489f2..16b0866ac3b6 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. 
+@@ -661,7 +641,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index ffa959e91227..ad4d17a51c0b 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -231,6 +231,8 @@ static void cpuidle_idle_call(void) + static void do_idle(void) + { + int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -241,7 +243,10 @@ static void do_idle(void) + */ + + __current_set_polling(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); +@@ -279,7 +284,8 @@ static void do_idle(void) + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +@@ -361,6 +367,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. + */ +@@ -481,3 +488,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 280a3c735935..70e7f77f7691 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2487,3 +2500,30 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++/* MuQSS compatibility functions */ ++static inline bool softirq_pending(int cpu) ++{ ++ return false; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6ec1e595b1d4..7261ad3ec264 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -3,6 +3,7 @@ + * Scheduler topology setup/handling methods + */ + #include "sched.h" ++#include "linux/sched/deadline.h" + + DEFINE_MUTEX(sched_domains_mutex); + +@@ -442,7 +443,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -468,7 +473,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + 
set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
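As an illustration only (a sketch under assumptions, not code in this file), scheduler-side use of this API boils down to inserting a task's node under a scheduler-chosen key and peeking at the first level-0 entry:

static void example_enqueue(struct rq *rq, skiplist_node *tnode,
			    struct task_struct *p, u64 key)
{
	// rq->niffies doubles as the random seed for level selection
	skiplist_insert(rq->sl, tnode, key, p, rq->niffies);
}

static struct task_struct *example_peek_best(struct rq *rq)
{
	skiplist_node *first = rq->node->next[0];

	// the header node points back to itself when the list is empty
	return first == rq->node ? NULL : first->value;
}

The names example_enqueue/example_peek_best, and the assumption that the caller supplies the queued task's skiplist_node, are illustrative; the real callers live in the MuQSS scheduler code, not in this file.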
++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 70665934d53e..0f9e94cb12aa 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; +-#ifdef CONFIG_PRINTK ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++static int zero = 0; ++static int one = 1; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -300,7 +310,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +327,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = 
&sysctl_sched_child_runs_first, +@@ -498,6 +509,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1082,62 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index fcc42353f125..46bb16d3c159 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -66,6 +66,9 @@ config NO_HZ_COMMON + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + ++config NO_HZ_FULL ++ bool ++ + choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ +@@ -87,8 +90,9 @@ config NO_HZ_IDLE + + Most of the time you want to say Y here. + +-config NO_HZ_FULL ++config NO_HZ_FULL_NODEF + bool "Full dynticks system (tickless)" ++ select NO_HZ_FULL + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping +@@ -114,6 +118,8 @@ config NO_HZ_FULL + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + ++ Not recommended for desktops,laptops, or mobile devices. ++ + Say N. 
+ + endchoice +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..544c58c29267 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++int __read_mostly hrtimer_granularity_us = 100; ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 8de90ea31280..19523b9640cd 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2209,3 +2209,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) ++ return schedule_timeout(jiffs); ++ ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 
0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 500; ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..0db83bdf7f39 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -845,7 +845,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 4820823515e9..13034cc7c9a4 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1567,7 +1568,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1585,6 +1586,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1654,7 +1658,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +@@ -1889,6 +1893,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. 
++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); +@@ -1897,10 +1913,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +@@ -2042,7 +2058,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -2056,7 +2084,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..6edb01f2fd81 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 572fb17c6273..0e42b5dc689e 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -177,7 +177,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
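The driver and sound hunks that follow are mechanical conversions from tick-granular sleeps to the millisecond hrtimeout helpers exported above. A minimal caller-side sketch of the substitution (hypothetical driver code, assuming the helper declarations come from the headers the full patch set touches; not one of the hunks below):

#include <linux/kernel.h>
#include <linux/sched.h>

/* Hypothetical helper: wait for hardware to settle for 'msecs' ms. */
static void example_settle_delay(unsigned int msecs)
{
	/*
	 * Old pattern, limited to jiffy resolution (10 ms at HZ=100):
	 *   schedule_timeout_uninterruptible(msecs_to_jiffies(msecs));
	 *
	 * New pattern: takes milliseconds directly, sets the task state
	 * itself, and uses hrtimers whenever their resolution beats the
	 * tick (see schedule_msec_hrtimeout() above), falling back to
	 * regular timers otherwise.
	 */
	schedule_msec_hrtimeout_uninterruptible(msecs);
}
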
+diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 294bfcf0ce0e..81d0fce8cdee 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index cc8594d76c70..6752b44dd154 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2002,7 +2002,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2010,7 +2010,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index f70b9f7e68bb..77b65398ca07 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index fe99584c917f..f1344d532a13 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1136,7 +1136,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 271235a69c01..3ec90e1b1eb4 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 6497c1ea6228..08fefeca9d82 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index b6378f025836..5f5e58655d32 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + __printf(3, 4) +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index 9c437c716cfd..4e0a02f07bf5 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, diff --git a/linux55-tkg/linux55-tkg-patches/0004-glitched-muqss.patch b/linux55-tkg/linux55-tkg-patches/0004-glitched-muqss.patch new file mode 100644 index 0000000..2c4837e --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0004-glitched-muqss.patch @@ -0,0 +1,78 @@ +From 
f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - MuQSS + +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index 84a1d08d68551..57c3036a68952 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ ++#ifdef CONFIG_ZENIFY ++int sched_iso_cpu __read_mostly = 25; ++#else + int sched_iso_cpu __read_mostly = 70; ++#endif + + /* + * sched_yield_type - Choose what sort of yield sched_yield will perform. + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + choice + prompt "Timer frequency" + default HZ_100 if SCHED_MUQSS +- default HZ_250_NODEF if !SCHED_MUQSS ++ default HZ_500_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -50,6 +50,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500_NODEF ++ bool "500 HZ" ++ help ++ 500 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ ++ config HZ_750_NODEF ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000_NODEF + bool "1000 HZ" + help +@@ -63,6 +70,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250_NODEF + default 300 if HZ_300_NODEF ++ default 500 if HZ_500_NODEF ++ default 750 if HZ_750_NODEF + default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus + + CKVERSION = -ck1 + CKNAME = MuQSS Powered +-EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. 
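The sched_iso_cpu default touched above, like the other MuQSS knobs registered in kernel/sysctl.c earlier in this patch set (rr_interval, interactive, iso_cpu, yield_type), remains adjustable at runtime through /proc/sys/kernel/. A small userspace sketch, illustrative only (writing requires root, and the value 50 is an example, not a recommendation):

#include <stdio.h>

int main(void)
{
	int iso_cpu = -1;
	FILE *f = fopen("/proc/sys/kernel/iso_cpu", "r+");

	if (!f) {
		perror("iso_cpu");
		return 1;
	}
	if (fscanf(f, "%d", &iso_cpu) == 1)
		printf("iso_cpu: %d%%\n", iso_cpu); /* 70 upstream, 25 with CONFIG_ZENIFY */

	rewind(f);
	fprintf(f, "50\n"); /* the sysctl table restricts accepted values to 0..100 */
	fclose(f);
	return 0;
}
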
diff --git a/linux55-tkg/linux55-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux55-tkg/linux55-tkg-patches/0004-glitched-ondemand-muqss.patch new file mode 100644 index 0000000..02933e4 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0004-glitched-ondemand-muqss.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (45) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (45) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux55-tkg/linux55-tkg-patches/0005-glitched-ondemand-pds.patch b/linux55-tkg/linux55-tkg-patches/0005-glitched-ondemand-pds.patch new file mode 100644 index 0000000..c1929e8 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0005-glitched-ondemand-pds.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux55-tkg/linux55-tkg-patches/0005-glitched-pds.patch b/linux55-tkg/linux55-tkg-patches/0005-glitched-pds.patch new file mode 100644 index 0000000..23271f5 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0005-glitched-pds.patch @@ -0,0 +1,166 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. + +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -715,6 +715,7 @@ menu "Scheduler features" + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_PDS + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) 
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/linux55-tkg/linux55-tkg-patches/0005-v5.5_undead-pds099o.patch b/linux55-tkg/linux55-tkg-patches/0005-v5.5_undead-pds099o.patch new file mode 100644 index 0000000..0e1c215 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0005-v5.5_undead-pds099o.patch @@ -0,0 +1,8369 @@ +From 89067d28ca90681fc6cf108de79b9aedb93dfa9d Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 9 Dec 2019 7:11:23 +0100 +Subject: PDS 099o, 5.5rc1 rebase + + +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..360a229b0abe 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -82,6 +82,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +106,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt +new file mode 100644 +index 000000000000..709e86f6487e +--- /dev/null ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- ++ ++CONTENT ++======== ++ ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. 
Each cpu has its own ++run queue. A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..9d44d8d78259 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1034,6 +1034,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index b66e81c06a57..a294f8f5fd75 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index dced033875bf..d2cd03766b09 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..8268cad4b0a2 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -644,9 +645,13 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -655,6 +660,7 @@ struct task_struct { + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -663,6 +669,7 @@ struct task_struct { + * used CPU that may be idle. 
+ */ + int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ + int wake_cpu; + #endif + int on_rq; +@@ -672,13 +679,27 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_PDS ++ int time_slice; ++ u64 deadline; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* CONFIG_SCHED_PDS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1283,6 +1304,29 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..a5e5fc2c9170 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_PDS ++ ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) ++{ ++ return 1; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_PDS */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..fba04bb91492 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,7 +20,18 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..a96012e6f15e 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_PDS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return 
false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..f186b8119ad6 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..713fedd8034f +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ Copyright (C) 2016 Alfred Chen. ++ ++ Code based on Con Kolivas's skip list implementation for BFS, and ++ which is based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++ ++This file only provides a infrastructure of skip list. ++ ++skiplist_node is embedded into container data structure, to get rid the ++dependency of kmalloc/kfree operation in scheduler code. ++ ++A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++macro and be used for skip list insert operation. ++ ++Random Level is also not defined in this file, instead, it should be customized ++implemented and set to node->level then pass to the customized skiplist_insert ++function. ++ ++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ ++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++considering that there will be 256 entries to enable the top level when using ++random level p=0.5, and that number is more than enough for a run queue usage ++in a scheduler usage. And it also help to reduce the memory usage of the ++embedded skip list node in task_struct to about 50%. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++BFS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
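To make the embedded-node pattern described in the note above concrete, here is a minimal usage sketch built on the accessors declared just below (hypothetical container type, not part of the patch): the front element is always head->next[0], which is why the scheduler lookup mentioned above is O(1).

/* Hypothetical container; the node is embedded, so no kmalloc is needed. */
struct my_item {
    u64                     key;
    struct skiplist_node    node;
};

/* Header, initialised once with FULL_INIT_SKIPLIST_NODE(&my_head). */
static struct skiplist_node my_head;

static struct my_item *my_first(void)
{
    if (skiplist_empty(&my_head))
        return NULL;
    /* O(1): the smallest key is always the first level-0 entry. */
    return skiplist_entry(my_head.next[0], struct my_item, node);
}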
++*/ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty()*/ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. ++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. 
++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 25b4fa00bad1..fc0aabdce15f 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -84,7 +84,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++#ifdef CONFIG_SCHED_PDS ++#define SCHED_ISO 4 ++#endif + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..ee3b9957cf3b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,21 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_PDS ++ bool "PDS-mq cpu scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -802,6 +817,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_PDS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -903,7 +919,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_PDS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1032,6 +1048,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
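One practical consequence of the SCHED_ISO definition added to the uapi header above: userspace can request the isochronous class with a plain sched_setscheduler() call on a PDS (or MuQSS) kernel, while stock kernels reject policy 4 with EINVAL. A hedged userspace sketch (glibc headers do not define SCHED_ISO, so the value is spelled out by hand):

#include <sched.h>
#include <stdio.h>

#define SCHED_ISO 4     /* from the patched include/uapi/linux/sched.h */

int main(void)
{
    struct sched_param sp = { .sched_priority = 0 };

    /* Ask for isochronous scheduling for the calling process; this is
     * expected to fail with EINVAL on kernels without PDS/MuQSS. */
    if (sched_setscheduler(0, SCHED_ISO, &sp))
        perror("sched_setscheduler(SCHED_ISO)");
    return 0;
}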
+@@ -1150,6 +1167,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_PDS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..89787e2feb60 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -58,6 +58,126 @@ struct task_struct init_task + __init_task_data + #endif + = { ++#ifdef CONFIG_SCHED_PDS ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), ++#endif ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ 
.numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -181,6 +301,7 @@ struct task_struct init_task + #ifdef CONFIG_SECURITY + .security = NULL, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index c87ee6412b36..4045c8532027 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..baa525865d5c 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_PDS ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2874bf556162..fad8a279fdfa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c 
+@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..8ebe4e33fb5f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..07f278dc3137 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
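Worth spelling out: because the PDS variant of sugov_get_util() above reports the full CPU capacity as utilization, schedutil has no per-task utilization signal to work with. Under the assumption that mainline's get_next_freq()/map_util_freq() maths is unchanged, the request it computes degenerates roughly as in the sketch below (not part of the patch), i.e. schedutil on a PDS kernel behaves much like the performance governor.

/*
 * Sketch, assuming mainline behaviour: with util == max,
 *   freq = map_util_freq(util, ref, max) = (ref + ref/4) * util / max
 * collapses to ref + ref/4, which cpufreq then clamps to policy->max.
 */
static unsigned int pds_schedutil_request_sketch(unsigned int ref_freq)
{
    return ref_freq + (ref_freq >> 2);   /* later clamped to policy->max */
}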
+@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -918,6 +929,7 @@ static int __init sugov_register(void) + fs_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..0a9548ee995c 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -663,7 +672,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..454fa7e460e3 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +new file mode 100644 +index 000000000000..aefbd9cebcfb +--- /dev/null ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6541 @@ ++/* ++ * kernel/sched/pds.c, was kernel/sched.c ++ * ++ * PDS-mq Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ */ ++#include "pds_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. 
++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; ++ ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; ++} ++__setup("rr_interval=", rr_interval_set); ++ ++ ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; ++ ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ ++#endif ++ ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
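The two knobs defined above, rr_interval and sched_yield_type, are the main runtime tunables PDS exposes: rr_interval can be set at boot with the rr_interval= parameter handled by the __setup() hook above, and (per the documentation hunk earlier in this patch) both appear under /proc/sys/kernel at runtime. A small userspace sketch, assuming a booted PDS kernel and root privileges (not part of the patch):

#include <stdio.h>

/* Tighten the timeslice from the default of 4 ms; sched_yield_type is
 * tuned the same way via /proc/sys/kernel/yield_type. */
int main(void)
{
    FILE *f = fopen("/proc/sys/kernel/rr_interval", "w");

    if (!f)
        return 1;   /* not a PDS kernel, or insufficient privileges */
    fprintf(f, "2\n");
    return fclose(f) ? 1 : 0;
}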
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. 
++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes CPU fairly amongst tasks of the ++ * same nice value, it proportions CPU according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 task_deadline_diff(const struct task_struct *p) ++{ ++ return sched_prio2deadline[TASK_USER_PRIO(p)]; ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return sched_prio2deadline[USER_PRIO(static_prio)]; ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline for non-rt tasks. 
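The priodl packing done by update_task_priodl() above is the key trick for run queue ordering: with the priority in the top 8 bits and deadline >> 8 in the low 56 bits, "lower priority value first, then earlier deadline" reduces to a single unsigned comparison, which the skip list search function later in this file relies on. An illustrative predicate (not part of the patch):

/* Illustration only: total ordering of two tasks as PDS sees it. */
static inline bool runs_before_sketch(const struct task_struct *a,
                                      const struct task_struct *b)
{
    return a->priodl < b->priodl;   /* prio first, then deadline */
}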
++ */ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); ++ ++ update_task_priodl(p); ++} ++ ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); ++} ++ ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) ++{ ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; ++ ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rcu_read_lock(); ++ ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) ++{ ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; ++} ++ ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) ++{ ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; ++} ++ ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) ++{ ++ struct task_struct *p = rq_first_queued_task(rq); ++ ++ if (p->prio != NORMAL_PRIO) ++ return; ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} ++ ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++} ++#endif ++ ++static inline void update_sched_rq_queued_masks(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif ++ ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; ++ ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) ++ return; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == 
per_cpu(sched_sibling_cpu, cpu)) ++ return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif ++} ++ ++static inline void update_sched_rq_pending_masks(struct rq *rq) ++{ ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); ++ ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); ++} ++ ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. ++ * ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
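A rough intuition for the per-level cpumasks and bitmaps maintained by the update_sched_rq_*_masks() helpers above: they let the scheduler answer "which CPUs are currently empty, idle, or running/queueing a given priority class" with plain bitmap operations instead of walking run queues. The real consumers appear later in this patch; the hypothetical helper below only illustrates the lookup pattern (an assumption for illustration, not part of the patch):

/* Hypothetical: pick any allowed CPU whose run queue is completely empty,
 * using the SCHED_RQ_EMPTY mask kept up to date above. */
static int find_empty_rq_cpu_sketch(const struct cpumask *allowed)
{
    cpumask_t tmp;

    if (cpumask_and(&tmp, allowed, &sched_rq_queued_masks[SCHED_RQ_EMPTY]))
        return cpumask_any(&tmp);
    return -1;
}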
++ */ ++static inline bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In the current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between levels, and there ++ * should be a platform hardware-supported instruction (known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when the task is created and doesn't ++ * change during the task's lifetime. When the task is being inserted into the run queue, this ++ * skiplist level is set to the task's sl_node->level; the skiplist insert function ++ * may change it based on the current level of the skip list. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if the key of @it is less than or equal to the key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed.
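++ * (SCHED_CPUFREQ_IOWAIT is the hint that cpufreq governors such as schedutil use to apply an I/O-wait boost on such wakeups.)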
++ */ ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * PDS doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. 
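++ * For example, a requested delay of 2000ns yields delta = max_t(s64, 2000, 10000LL) = 10000 below, so the armed slice is never shorter than 10us.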
++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ return policy_to_prio[p->policy]; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. 
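++ * (The smp_wmb() below orders those earlier per-task stores before the WRITE_ONCE() of ->cpu.)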
We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq, p); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). 
*/ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! 
We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
++ */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, &p->cpus_mask) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask; ++ ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; ++ ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++/* ++ * task_preemptible_rq - return the rq on which the given task can preempt ++ * @p: task that wants to preempt a CPU ++ * @only_preempt_low_policy: only preempt rqs running a lower policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) ++{ ++ cpumask_t tmp; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++ ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on a cpu which is not SMT suppressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); ++ } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) ++{ ++ cpumask_t tmp; ++ int level; ++ ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif ++ ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); ++ ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); ++ } ++ ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; ++ ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; ++ } ++ ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; ++ ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask)))
++ return select_fallback_rq(task_cpu(p), p); ++ ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } ++ ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** PDS ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. 
Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. 
++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif ++ ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. 
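++ * (sched_reset_on_fork is set when the parent requested SCHED_RESET_ON_FORK via sched_setscheduler()/sched_setattr().)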
++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); ++#endif ++ ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); ++ resched_curr(rq); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
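++ * For example, booting with "schedstats=enable" only sets __sched_schedstats here; the static branch is flipped later when init_schedstats() runs.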
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_mask can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch has flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could race with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing through ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE.
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible for using it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data, preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how it's mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait accounting is to account the idle time that we could ++ * have spent running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've run at the same time, ++ * utilising both CPUs. ++ * ++ * This means that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that has not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have an optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok.
++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ pds_update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void pds_scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++ ++ /** ++ * p->time_slice < RESCHED_US. We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_SCHED_SMT ++static int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ int cpu; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* ++ * _something_ may have changed the task, double check again ++ */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* pds_sg_balance_trigger - trigger sibling group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return; ++ curr = rq->curr; ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; ++ ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++/* ++ * pds_sg_balance_check - sibling group balance check for run queue @rq ++ */ ++static inline void pds_sg_balance_check(const struct rq *rq) ++{ ++ cpumask_t chk; ++ int i; ++ ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* Only cpu in sibling idle group will do the checking */ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the
currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) ++ cpufreq_update_this_cpu(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ 
end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); ++#endif ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. 
Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_deadline(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ pds_sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. 
++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ ++ requeue_task(p, rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. 
++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++ ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ /* rq lock may not held!! */ ++ update_rq_clock(rq); ++ ++ p->static_prio = new_static; ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->deadline -= task_deadline_diff(p); ++ p->deadline += static_deadline_diff(new_static); ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int level, prio = p->prio - MAX_RT_PRIO; ++ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; ++ ++ /* rt tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ preempt_disable(); ++ level = task_deadline_level(p, this_rq()); ++ preempt_enable(); ++ prio += level_to_nice_prio[level]; ++ if (idleprio_task(p)) ++ prio += NICE_WIDTH; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. 
++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. 
++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int old_policy = p->policy; ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int ++__sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. 
++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. ++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. 
++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. 
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. 
++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_mask, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask ++ */ ++ cpumask_copy(new_mask, cpus_mask); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_mask); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); ++ } ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In PDS, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* PDS TODO: should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. 
++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { ++ int dest_cpu; ++ ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ ++ /* skip the running task */ ++ if (task_running(p)) ++ continue; ++ ++ /* ++ * Rules for changing task_struct::cpus_mask are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. 
++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ node = &rq->sl_header; ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. 
This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ 
cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; ++#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. 
Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef 
CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +new file mode 100644 +index 000000000000..b3926a8425b2 +--- /dev/null ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,481 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. 
++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ struct skiplist_node sl_header; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ 
++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : PDS need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. 
++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* PDS_SCHED_H */ +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..d3d12baa9036 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..26d6b47fc156 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_PDS + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_PDS + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_PDS */ + + #else + ++#ifndef CONFIG_SCHED_PDS + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..4fc9f2ead4d2 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" ++#else ++ + #include + + #include +@@ -2496,3 +2500,4 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++#endif /* !CONFIG_SCHED_PDS */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..45bd43942575 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..204933ebc95a 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_PDS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_PDS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof 
(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..71af3cd30ccc 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_PDS + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..3eaa2a21caa4 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux55-tkg/linux55-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux55-tkg/linux55-tkg-patches/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..7425de1 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,192 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. 
Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case 
PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, + /* Amazon Annapurna Labs */ + { PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + diff --git a/linux55-tkg/linux55-tkg-patches/0007-v5.5-fsync.patch b/linux55-tkg/linux55-tkg-patches/0007-v5.5-fsync.patch new file mode 100644 index 0000000..027116f --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0007-v5.5-fsync.patch @@ -0,0 +1,419 @@ +split the futex key setup from the queue locking and key reading. This +is useful to support the setup of multiple keys at the same time, like +what is done in futex_requeue() and what will be done for the +FUTEX_WAIT_MULTIPLE command. + +Signed-off-by: Gabriel Krisman Bertazi +--- + kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 42 insertions(+), 29 deletions(-) + +diff --git a/kernel/futex.c b/kernel/futex.c +index 6d50728ef2e7..91f3db335c57 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2631,6 +2631,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + __set_current_state(TASK_RUNNING); + } + ++static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ ++ u32 uval; ++ int ret; ++ ++retry_private: ++ *hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (ret) { ++ queue_unlock(*hb); ++ ++ ret = get_user(uval, uaddr); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ return 1; ++ } ++ ++ if (uval != val) { ++ queue_unlock(*hb); ++ ret = -EWOULDBLOCK; ++ } ++ ++ return ret; ++} ++ + /** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address +@@ -2651,7 +2684,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) + { +- u32 uval; + int ret; + + /* +@@ -2672,38 +2704,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. + */ +-retry: +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +-retry_private: +- *hb = queue_lock(q); ++ do { ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, ++ &q->key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; + +- ret = get_futex_value_locked(&uval, uaddr); ++ ret = __futex_wait_setup(uaddr, val, flags, q, hb); + +- if (ret) { +- queue_unlock(*hb); +- +- ret = get_user(uval, uaddr); ++ /* Drop key reference if retry or error. 
*/ + if (ret) +- goto out; ++ put_futex_key(&q->key); ++ } while (ret > 0); + +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- put_futex_key(&q->key); +- goto retry; +- } +- +- if (uval != val) { +- queue_unlock(*hb); +- ret = -EWOULDBLOCK; +- } +- +-out: +- if (ret) +- put_futex_key(&q->key); + return ret; + } + +-- +2.20.1 + +This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows +a thread to wait on several futexes at the same time, and be awoken by +any of them. In a sense, it implements one of the features that was +supported by pooling on the old FUTEX_FD interface. + +My use case for this operation lies in Wine, where we want to implement +a similar interface available in Windows, used mainly for event +handling. The wine folks have an implementation that uses eventfd, but +it suffers from FD exhaustion (I was told they have application that go +to the order of multi-milion FDs), and higher CPU utilization. + +In time, we are also proposing modifications to glibc and libpthread to +make this feature available for Linux native multithreaded applications +using libpthread, which can benefit from the behavior of waiting on any +of a group of futexes. + +In particular, using futexes in our Wine use case reduced the CPU +utilization by 4% for the game Beat Saber and by 1.5% for the game +Shadow of Tomb Raider, both running over Proton (a wine based solution +for Windows emulation), when compared to the eventfd interface. This +implementation also doesn't rely of file descriptors, so it doesn't risk +overflowing the resource. + +Technically, the existing FUTEX_WAIT implementation can be easily +reworked by using do_futex_wait_multiple with a count of one, and I +have a patch showing how it works. I'm not proposing it, since +futex is such a tricky code, that I'd be more confortable to have +FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, +before considering modifying FUTEX_WAIT. + +From an implementation perspective, the futex list is passed as an array +of (pointer,value,bitset) to the kernel, which will enqueue all of them +and sleep if none was already triggered. It returns a hint of which +futex caused the wake up event to userspace, but the hint doesn't +guarantee that is the only futex triggered. Before calling the syscall +again, userspace should traverse the list, trying to re-acquire any of +the other futexes, to prevent an immediate -EWOULDBLOCK return code from +the kernel. + +This was tested using three mechanisms: + +1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and +running the unmodified tools/testing/selftests/futex and a full linux +distro on top of this kernel. + +2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a +multi-threaded, event-handling setup. + +3) By running the Wine fsync implementation and executing multi-threaded +applications, in particular the modern games mentioned above, on top of +this implementation. + +Signed-off-by: Zebediah Figura +Signed-off-by: Steven Noonan +Signed-off-by: Pierre-Loup A. 
Griffais +Signed-off-by: Gabriel Krisman Bertazi +--- + include/uapi/linux/futex.h | 7 ++ + kernel/futex.c | 161 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 164 insertions(+), 4 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..2401c4cf5095 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -150,4 +151,10 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 91f3db335c57..2623e8f152cd 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -183,6 +183,7 @@ static int __read_mostly futex_cmpxchg_enabled; + #endif + #define FLAGS_CLOCKRT 0x02 + #define FLAGS_HAS_TIMEOUT 0x04 ++#define FLAGS_WAKE_MULTIPLE 0x08 + + /* + * Priority Inheritance state: +@@ -2720,6 +2721,150 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++static int do_futex_wait_multiple(struct futex_wait_block *wb, ++ u32 count, unsigned int flags, ++ ktime_t *abs_time) ++{ ++ ++ struct hrtimer_sleeper timeout, *to; ++ struct futex_hash_bucket *hb; ++ struct futex_q *qs = NULL; ++ int ret; ++ int i; ++ ++ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); ++ if (!qs) ++ return -ENOMEM; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++ retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ qs[i].bitset = wb[i].bitset; ++ ++ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret != 0)) { ++ for (--i; i >= 0; i--) ++ put_futex_key(&qs[i].key); ++ goto out; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, ++ flags, &qs[i], &hb); ++ if (ret) { ++ /* Drop the failed key directly. keys 0..(i-1) ++ * will be put by unqueue_me. ++ */ ++ put_futex_key(&qs[i].key); ++ ++ /* Undo the partial work we did. */ ++ for (--i; i >= 0; i--) ++ unqueue_me(&qs[i]); ++ ++ __set_current_state(TASK_RUNNING); ++ if (ret > 0) ++ goto retry; ++ goto out; ++ } ++ ++ /* We can't hold to the bucket lock when dealing with ++ * the next futex. Queue ourselves now so we can unlock ++ * it before moving on. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* There is no easy to way to check if we are wake already on ++ * multiple futexes without waking through each one of them. So ++ * just sleep and let the scheduler handle it. ++ */ ++ if (!to || to->task) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = -ETIMEDOUT; ++ /* If we were woken (and unqueued), we succeeded. */ ++ for (i = 0; i < count; i++) ++ if (!unqueue_me(&qs[i])) ++ ret = i; ++ ++ /* Succeed wakeup */ ++ if (ret >= 0) ++ goto out; ++ ++ /* Woken by triggered timeout */ ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. 
++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ ret = -ERESTART_RESTARTBLOCK; ++ out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ kfree(qs); ++ return ret; ++} ++ ++static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, ++ u32 count, ktime_t *abs_time) ++{ ++ struct futex_wait_block *wb; ++ struct restart_block *restart; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); ++ if (!wb) ++ return -ENOMEM; ++ ++ if (copy_from_user(wb, uaddr, ++ count * sizeof(struct futex_wait_block))) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ ret = do_futex_wait_multiple(wb, count, flags, abs_time); ++ ++ if (ret == -ERESTART_RESTARTBLOCK) { ++ restart = ¤t->restart_block; ++ restart->fn = futex_wait_restart; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = count; ++ restart->futex.time = *abs_time; ++ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | ++ FLAGS_WAKE_MULTIPLE); ++ } ++ ++out: ++ kfree(wb); ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -2797,6 +2942,10 @@ static long futex_wait_restart(struct restart_block *restart) + } + restart->fn = do_no_restart_syscall; + ++ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) ++ return (long)futex_wait_multiple(uaddr, restart->futex.flags, ++ restart->futex.val, tp); ++ + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); + } +@@ -3680,6 +3829,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ case FUTEX_WAIT_MULTIPLE: ++ return futex_wait_multiple(uaddr, flags, val, timeout); + } + return -ENOSYS; + } +@@ -3696,7 +3847,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3705,7 +3857,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -3889,14 +4041,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +-- +2.20.1 diff --git a/linux55-tkg/linux55-tkg-patches/0009-bmq_v5.5-r3.patch b/linux55-tkg/linux55-tkg-patches/0009-bmq_v5.5-r3.patch new file mode 100644 index 0000000..cf7bb1f --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0009-bmq_v5.5-r3.patch @@ -0,0 +1,7624 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e0..80d796db0935 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -432,6 +432,11 @@ + embedded devices based on command line input. + See Documentation/block/cmdline-partition.rst + ++ bmq.timeslice= [KNL] Time slice in us for BMQ scheduler. ++ Format: (must be >= 1000) ++ Default: 4000 ++ See Documentation/scheduler/sched-BMQ.txt ++ + boot_delay= Milliseconds to delay each printk during boot. + Values larger than 10 seconds (10000) are changed to + no delay (0). +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index def074807cee..e4bc9350f192 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -105,6 +105,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -1173,3 +1174,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ CPU scheduler only. This determines what type of yield calls to ++sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. 
When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. ++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index 737ff3b9c2c0..b5bc5a1b6de7 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 82a4d37ddecb..1130e0f5db72 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. + */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. 
+ */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 716ad1d8d95e..9d08ce1d6e6c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -649,13 +649,18 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BMQ) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_BMQ + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -669,6 +674,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_BMQ */ + #endif + int on_rq; + +@@ -677,13 +683,23 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_BMQ ++ u64 last_ran; ++ s64 time_slice; ++ int boost_prio; ++ int bmq_idx; ++ struct list_head bmq_node; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_BMQ */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1298,6 +1314,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_BMQ ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_BMQ */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..02a3c5d34ee4 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_BMQ ++ ++#define __tsk_deadline(p) (0UL) ++ ++static inline int dl_prio(int prio) ++{ ++ return 0; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return (SCHED_NORMAL == p->policy); ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_BMQ */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..d9dc5d3ccd2e 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,17 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++#ifdef CONFIG_SCHED_BMQ ++/* +/- priority levels from the base priority */ ++#define MAX_PRIORITY_ADJ 4 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..6387c8ea9832 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_BMQ + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/init/Kconfig b/init/Kconfig +index a34064a031a5..256e555538b7 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -697,9 +697,20 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++ Say Y here. ++ default y ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_BMQ + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -786,6 +797,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_BMQ + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -887,7 +899,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_BMQ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1134,6 +1146,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BMQ + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..c293de91d90f 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_BMQ ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +84,12 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +97,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 58f5073acff7..9301e25986d3 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP 
++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1005,7 +1005,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_BMQ */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 2833ffb0c211..37a1f8d73eee 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..3ad290e9fed8 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_BMQ ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 851bbb10819d..019fdab7e329 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. 
+ */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..ac31239aa51a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_BMQ ++obj-y += bmq.o bmq_debug.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/bmq.c b/kernel/sched/bmq.c +new file mode 100644 +index 000000000000..6a5ab93a30bb +--- /dev/null ++++ b/kernel/sched/bmq.c +@@ -0,0 +1,5999 @@ ++/* ++ * kernel/sched/bmq.c ++ * ++ * BMQ Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
++ */ ++#include "bmq_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* Default time slice is 4 in ms, can be set via kernel parameter "bmq.timeslice" */ ++u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); ++ ++static int __init sched_timeslice(char *str) ++{ ++ int timeslice_us; ++ ++ get_option(&str, ×lice_us); ++ if (timeslice_us >= 1000) ++ sched_timeslice_ns = timeslice_us * 1000; ++ ++ return 0; ++} ++early_param("bmq.timeslice", sched_timeslice); ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "bmq: BMQ CPU Scheduler 5.5-r3 by Alfred Chen.\n"); ++} ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (sched_timeslice_ns >>\ ++ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++static cpumask_t sched_rq_watermark[bmq_BITS] ____cacheline_aligned_in_smp; ++ ++#if (bmq_BITS <= BITS_PER_LONG) ++#define bmq_find_first_bit(bm) __ffs((bm[0])) ++#define bmq_find_next_bit(bm, start) __ffs(BITMAP_FIRST_WORD_MASK(start) & bm[0]) ++#else ++#define bmq_find_first_bit(bm) find_first_bit((bm), bmq_BITS) ++#define bmq_find_next_bit(bm, start) find_next_bit(bm, bmq_BITS, start) ++#endif ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = bmq_find_first_bit(rq->queue.bitmap); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline int task_sched_prio(struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO)? 
0:p->prio - MAX_RT_PRIO + p->boost_prio + 1; ++} ++ ++static inline void bmq_init(struct bmq *q) ++{ ++ int i; ++ ++ bitmap_zero(q->bitmap, bmq_BITS); ++ for(i = 0; i < bmq_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void bmq_init_idle(struct bmq *q, struct task_struct *idle) ++{ ++ INIT_LIST_HEAD(&q->heads[IDLE_TASK_SCHED_PRIO]); ++ list_add(&idle->bmq_node, &q->heads[IDLE_TASK_SCHED_PRIO]); ++ set_bit(IDLE_TASK_SCHED_PRIO, q->bitmap); ++} ++ ++static inline void bmq_add_task(struct task_struct *p, struct bmq *q, int idx) ++{ ++ struct list_head *n; ++ ++ if (likely(idx)) { ++ list_add_tail(&p->bmq_node, &q->heads[idx]); ++ return; ++ } ++ ++ list_for_each(n, &q->heads[idx]) ++ if (list_entry(n, struct task_struct, bmq_node)->prio > p->prio) ++ break; ++ __list_add(&p->bmq_node, n->prev, n); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *rq_first_bmq_task(struct rq *rq) ++{ ++ unsigned long idx = bmq_find_first_bit(rq->queue.bitmap); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++rq_next_bmq_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = bmq_find_next_bit(rq->queue.bitmap, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = rq_first_bmq_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = rq_next_bmq_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); 
++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ list_del(&p->bmq_node); ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->bmq_idx = task_sched_prio(p); ++ bmq_add_task(p, &rq->queue, p->bmq_idx); ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ int idx = task_sched_prio(p); ++ ++ lockdep_assert_held(&rq->lock); ++ WARN_ONCE(task_rq(p) != rq, "bmq: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ list_del(&p->bmq_node); ++ bmq_add_task(p, &rq->queue, idx); ++ if (idx != p->bmq_idx) { ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ p->bmq_idx = idx; ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. 
++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++static inline void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++static inline bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (cpu_online(cpu) && !wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++ ++#endif /* CONFIG_NO_HZ_COMMON */ ++#endif /* CONFIG_SMP */ ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (rq_first_bmq_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * BMQ doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. 
++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. 
++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). 
*/ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. 
++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
*/ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. 
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(p->cpus_ptr, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else /* CONFIG_SMP */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** BMQ ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. 
++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. 
++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. 
++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if(cpu_rq(smp_processor_id())->clock - p->last_ran > sched_timeslice_ns) ++ boost_task(p); ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ * ++ * __sched_fork() is basic setup used by init_idle() too: ++ */ ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ p->on_rq = 0; ++ p->on_cpu = 0; ++ p->utime = 0; ++ p->stime = 0; ++ p->sched_time = 0; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++} ++ ++/* ++ * fork()/clone()-time setup: ++ */ ++int sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++ __sched_fork(clone_flags, p); ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. 
++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ resched_curr(rq); ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * sched_exec - execve() is a valuable balancing opportunity, because at ++ * this point the task has the smallest effective memory and cache ++ * footprint. 
++ */ ++void sched_exec(void) ++{ ++ struct task_struct *p = current; ++ int dest_cpu; ++ ++ if (task_rq(p)->nr_running < 2) ++ return; ++ ++ dest_cpu = cpumask_any_and(p->cpus_ptr, &sched_rq_watermark[IDLE_WM]); ++ if ( dest_cpu < nr_cpu_ids) { ++#ifdef CONFIG_SCHED_SMT ++ int smt = cpumask_any_and(p->cpus_ptr, &sched_sg_idle_mask); ++ if (smt < nr_cpu_ids) ++ dest_cpu = smt; ++#endif ++ if (likely(cpu_active(dest_cpu))) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); ++ return; ++ } ++ } ++} ++ ++#endif ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. ++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. 
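
For reference, the runtime accounting maintained by update_curr()/task_sched_runtime() above is what per-thread CPU clocks report back to userspace; the exact plumbing through the POSIX CPU-timer code is an assumption on my part, but the value read below should track that accounting. A minimal sketch:

    #include <stdio.h>
    #include <time.h>

    /* Burn a little CPU, then read this thread's accumulated CPU time. */
    int main(void)
    {
        struct timespec ts;
        volatile unsigned long i, x = 0;

        for (i = 0; i < 50000000UL; i++)
            x += i;

        if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
            printf("thread CPU time: %ld.%09ld s\n",
                   (long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }
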
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { ++ int cpu = cpu_of(rq); ++ int dcpu = __best_mask_cpu(cpu, &tmp, ++ per_cpu(sched_cpu_llc_mask, cpu)); ++ rq = move_queued_task(rq, p, dcpu); ++ } ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq= cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void sg_balance_check(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu; ++ ++ /* exit when no sg in idle */ ++ if (cpumask_empty(&sched_sg_idle_mask)) ++ return; ++ ++ cpu = cpu_of(rq); ++ /* ++ * Only cpu in slibing idle group will do the checking and then ++ * find potential cpus which can migrate the current running task ++ */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && ++ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { ++ int i, tried = 0; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ if (cpumask_subset(cpu_smt_mask(i), &chk)) { ++ if (sg_balance_trigger(i)) ++ return; ++ if (tried) ++ return; ++ tried++; ++ } ++ } ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. 
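
The remote-tick state machine described above only comes into play for CPUs running in full dynticks mode. As a hedged aside, kernels built with CONFIG_NO_HZ_FULL normally expose the adaptive-tick CPU list in sysfs (path assumed from the mainline ABI; it may be absent on your build); the sketch below just prints it if present.

    #include <stdio.h>

    int main(void)
    {
        char buf[256];
        FILE *f = fopen("/sys/devices/system/cpu/nohz_full", "r");

        if (!f) {
            puts("nohz_full list not exposed (CONFIG_NO_HZ_FULL likely off)");
            return 0;
        }
        if (fgets(buf, sizeof(buf), f))
            printf("nohz_full CPUs: %s", buf);
        fclose(f);
        return 0;
    }
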
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ scheduler_task_tick(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? 
++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = 
min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = rq_next_bmq_task(skip, rq)) != rq->idle) { ++ skip = rq_next_bmq_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *affinity_mask, *end_mask; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ nr_migrated = migrate_pending_tasks(src_rq, rq, cpu); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated) { ++ cpufreq_update_util(rq, 0); ++ return 1; ++ } ++ } ++ } while (++affinity_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. ++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++ } ++} ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ next = rq_runnable_task(rq); ++#endif ++ rq->skip = NULL; ++ return next; ++ } ++ ++ next = rq_first_bmq_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ return rq_first_bmq_task(rq); ++#endif ++ return next; ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++ if (unlikely(sched_timeslice_ns == p->time_slice)) ++ rq->last_ts_switch = rq->clock; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, p->time_slice); ++#endif ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. 
++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which BMQ doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ if (rq_switch_time(rq) < boost_threshold(prev)) ++ boost_task(prev); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (MAX_PRIO == next->prio) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ rq->last_ts_switch = rq->clock; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. 
Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. */ ++ if (task_on_rq_queued(p) && task_sched_prio(p) != p->bmq_idx) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. 
There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
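
The nice path above (set_user_nice()/can_nice()/sys_nice()) is reachable from userspace through nice(2) and getpriority(2)/setpriority(2). A minimal sketch; note that lowering the nice value (raising priority) needs CAP_SYS_NICE or a permissive RLIMIT_NICE, mirroring the can_nice() check above.

    #include <errno.h>
    #include <stdio.h>
    #include <sys/resource.h>
    #include <unistd.h>

    int main(void)
    {
        errno = 0;
        int nice_after = nice(5);          /* drop our priority by 5 */
        if (nice_after == -1 && errno)
            perror("nice");
        else
            printf("nice value now: %d\n", nice_after);

        errno = 0;                         /* read the same value back */
        int prio = getpriority(PRIO_PROCESS, 0);
        if (prio == -1 && errno)
            perror("getpriority");
        else
            printf("getpriority reports: %d\n", prio);
        return 0;
    }
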
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
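
The "keep a potential priority boosting" handling above ties into the priority-inheritance path implemented by rt_mutex_setprio() earlier in this hunk. From userspace, PI is requested per mutex; a minimal pthread sketch (build with -pthread), assuming glibc's PI-futex support:

    #include <pthread.h>
    #include <stdio.h>

    /* A PI mutex: while a higher-priority task blocks on it, the current
     * owner is boosted through the rt_mutex machinery patched in above. */
    int main(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutex_t lock;

        pthread_mutexattr_init(&attr);
        if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT) != 0) {
            puts("PTHREAD_PRIO_INHERIT not supported here");
            return 1;
        }
        pthread_mutex_init(&lock, &attr);

        pthread_mutex_lock(&lock);
        /* ... critical section; contending RT threads would boost us ... */
        pthread_mutex_unlock(&lock);

        pthread_mutex_destroy(&lock);
        pthread_mutexattr_destroy(&attr);
        return 0;
    }
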
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * BMQ supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. 
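
The permission block earlier in __sched_setscheduler() is the reason unprivileged realtime setups care about RLIMIT_RTPRIO. A minimal sketch that checks the rlimit and then requests SCHED_RR within it, staying inside what a non-CAP_SYS_NICE caller is allowed:

    #include <sched.h>
    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit rl;
        struct sched_param sp = { 0 };

        if (getrlimit(RLIMIT_RTPRIO, &rl) != 0 || rl.rlim_cur == 0) {
            puts("RLIMIT_RTPRIO is 0: unprivileged RT not permitted");
            return 0;
        }

        /* Clamp to the usual 1..99 user RT range. */
        sp.sched_priority = rl.rlim_cur > 99 ? 99 : (int)rl.rlim_cur;
        if (sched_setscheduler(0, SCHED_RR, &sp) != 0)
            perror("sched_setscheduler");
        else
            printf("running SCHED_RR at priority %d\n", sp.sched_priority);
        return 0;
    }
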
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. 
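
sched_setattr(2), handled just below, has no glibc wrapper, so callers typically go through syscall(2) with a hand-declared struct sched_attr matching the UAPI layout that sched_copy_attr() parses above (the declaration below mirrors that layout as I understand it; prefer your system headers where available). Note that on this BMQ kernel a SCHED_DEADLINE request is squashed onto SCHED_FIFO, as handled earlier in __sched_setscheduler().

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    struct sched_attr {            /* mirrors include/uapi/linux/sched/types.h */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;       /* SCHED_NORMAL / SCHED_BATCH */
        uint32_t sched_priority;   /* SCHED_FIFO / SCHED_RR */
        uint64_t sched_runtime;    /* SCHED_DEADLINE (squashed by BMQ) */
        uint64_t sched_deadline;
        uint64_t sched_period;
    };

    int main(void)
    {
        struct sched_attr attr = {
            .size = sizeof(attr),
            .sched_policy = 0,     /* SCHED_NORMAL */
            .sched_nice = 10,
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0)
            perror("sched_setattr");
        return 0;
    }
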
++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. 
++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
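
The affinity path above is what the sched_setaffinity(2) syscall defined just below lands in. A minimal sketch pinning the calling process to CPU 0 with the glibc cpu_set_t helpers:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* run only on CPU 0 */
        if (sched_setaffinity(0, sizeof(set), &set) != 0) {
            perror("sched_setaffinity");
            return 1;
        }
        printf("pinned to CPU 0\n");
        return 0;
    }
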
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) { ++ current->boost_prio = MAX_PRIORITY_ADJ; ++ requeue_task(current, rq); ++ } ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
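
do_sched_yield() earlier in this hunk gives sched_yield(2) three behaviours on BMQ, selected by sched_yield_type: 0 makes it a no-op, 1 requeues the caller with its boost priority reset to the maximum adjustment, and 2 marks the current task to be skipped once. The userspace call itself is unchanged; a trivial sketch that also reads the current affinity via sched_getaffinity(2), whose kernel side appears just above:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        cpu_set_t set;

        if (sched_getaffinity(0, sizeof(set), &set) == 0)
            printf("allowed on %d CPU(s)\n", CPU_COUNT(&set));

        /* Effect depends on the sched_yield_type handling in do_sched_yield(). */
        sched_yield();
        return 0;
    }
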
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In BMQ, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
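
The priority-range syscalls around this point (sched_get_priority_max/min) let portable code discover the valid static-priority range instead of hard-coding 1..99. A minimal sketch:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        printf("SCHED_FIFO priority range: %d..%d\n",
               sched_get_priority_min(SCHED_FIFO),
               sched_get_priority_max(SCHED_FIFO));
        printf("SCHED_RR   priority range: %d..%d\n",
               sched_get_priority_min(SCHED_RR),
               sched_get_priority_max(SCHED_RR));
        return 0;
    }
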
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(sched_timeslice_ns); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: BMQ should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: CPU the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ __sched_fork(0, idle); ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter as it will never be de/enqueued */ ++ idle->prio = MAX_PRIO; ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ bmq_init_idle(&rq->queue, idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. 
++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ p = rq_first_bmq_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = rq_first_bmq_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. 
++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ per_cpu(sd_llc_id, cpu) = cpu; ++ } ++} ++ ++#define TOPOLOGY_CPUMASK(name, func) \ ++ if (cpumask_and(chk, chk, func(cpu))) { \ ++ per_cpu(sched_cpu_llc_mask, cpu) = chk; \ ++ per_cpu(sd_llc_id, cpu) = cpumask_first(func(cpu)); \ ++ printk(KERN_INFO "bmq: cpu#%d affinity mask - "#name" 0x%08lx", \ ++ cpu, (chk++)->bits[0]); \ ++ } \ ++ cpumask_complement(chk, func(cpu)) ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++ cpumask_complement(chk, cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask); ++#endif ++#ifdef CONFIG_SCHED_MC ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask); ++#endif ++ ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "bmq: cpu#%d affinity mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ printk(KERN_INFO "bmq: cpu#%d llc_id = %d, llc_mask idx = %ld\n", ++ cpu, per_cpu(sd_llc_id, cpu), ++ per_cpu(sched_cpu_llc_mask, cpu) - ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; 
++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < bmq_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ bmq_init(&rq->queue); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/bmq_debug.c b/kernel/sched/bmq_debug.c +new file mode 100644 +index 000000000000..375a1a805d86 +--- /dev/null ++++ b/kernel/sched/bmq_debug.c +@@ -0,0 +1,31 @@ ++/* ++ * kernel/sched/bmq_debug.c ++ * ++ * Print the BMQ debugging details ++ * ++ * Author: Alfred Chen ++ * Date : 2020 ++ */ ++#include "bmq_sched.h" ++ ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} +diff --git a/kernel/sched/bmq_sched.h b/kernel/sched/bmq_sched.h +new file mode 100644 +index 000000000000..449d6b54a253 +--- /dev/null ++++ b/kernel/sched/bmq_sched.h +@@ -0,0 +1,509 @@ ++#ifndef BMQ_SCHED_H ++#define BMQ_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* bits: ++ * RT, Low prio adj range, nice width, high prio adj range, cpu idle task */ ++#define bmq_BITS (NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 2) ++#define IDLE_TASK_SCHED_PRIO (bmq_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, bmq_BITS); ++ struct list_head heads[bmq_BITS]; ++}; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. 
++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++ struct bmq queue; ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern bool sched_smp_initialized; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++ ++static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, ++ const cpumask_t *mask) ++{ ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? 
cpu : ++ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* BMQ_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 9b8916fd00a2..9073fba046c8 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_BMQ */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_BMQ + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -916,6 +927,7 @@ static int __init sugov_register(void) + core_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_BMQ + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_BMQ */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index d43318a489f2..1a312bb6f4a1 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. 
*/ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -661,7 +661,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index ffa959e91227..469f36c89a9d 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * idle-task scheduling class. + */ +@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..22c20e28b613 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..4da52afaeff8 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_BMQ + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_BMQ + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_BMQ */ + + #else + ++#ifndef CONFIG_SCHED_BMQ + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 280a3c735935..db07c37806bc 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq_sched.h" ++#else ++ + #include + + #include +@@ -2487,3 +2491,9 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_BMQ */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..0cc040a28d3f 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6ec1e595b1d4..f02bbaf837b3 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -4,6 +4,7 @@ + */ + #include "sched.h" + ++#ifndef CONFIG_SCHED_BMQ + DEFINE_MUTEX(sched_domains_mutex); + + /* Protected by sched_domains_mutex: */ +@@ -1182,8 +1183,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + */ + + static int default_relax_domain_level = -1; ++#endif /* CONFIG_SCHED_BMQ */ + int sched_domain_level_max; + ++#ifndef CONFIG_SCHED_BMQ + static int __init setup_relax_domain_level(char *str) + { + if (kstrtoint(str, 0, &default_relax_domain_level)) +@@ -1425,6 +1428,7 @@ sd_init(struct sched_domain_topology_level *tl, + + return sd; + } ++#endif /* CONFIG_SCHED_BMQ */ + + /* + * Topology list, bottom-up. 
+@@ -1454,6 +1458,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) + sched_domain_topology = tl; + } + ++#ifndef CONFIG_SCHED_BMQ + #ifdef CONFIG_NUMA + + static const struct cpumask *sd_numa_mask(int cpu) +@@ -2289,3 +2294,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + mutex_unlock(&sched_domains_mutex); + } ++#else /* CONFIG_SCHED_BMQ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{} ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++#endif +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 70665934d53e..8d0157d9932e 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -132,6 +132,10 @@ static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_BMQ ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BMQ) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_BMQ + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_BMQ */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_BMQ ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..70b97fe0ff44 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_BMQ + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void 
check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..208788fcbb0e 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_BMQ ++ /* No deadline on BMQ, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux55-tkg/linux55-tkg-patches/0009-glitched-bmq.patch b/linux55-tkg/linux55-tkg-patches/0009-glitched-bmq.patch new file mode 100644 index 0000000..38666e4 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0009-glitched-bmq.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. 
+ ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. diff --git a/linux55-tkg/linux55-tkg-patches/0009-glitched-ondemand-bmq.patch b/linux55-tkg/linux55-tkg-patches/0009-glitched-ondemand-bmq.patch new file mode 100644 index 0000000..6e30518 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0009-glitched-ondemand-bmq.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux55-tkg/linux55-tkg-patches/0011-ZFS-fix.patch b/linux55-tkg/linux55-tkg-patches/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux55-tkg/linux55-tkg-patches/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff --git a/linux56-rc-tkg/PKGBUILD b/linux56-rc-tkg/PKGBUILD new file mode 100644 index 0000000..bd2708f --- /dev/null +++ b/linux56-rc-tkg/PKGBUILD @@ -0,0 +1,1114 @@ +# Based on the file created for Arch Linux by: +# Tobias Powalowski +# Thomas Baechler + +# Contributor: Tk-Glitch + 
+plain ' .---.` `.---.' +plain ' `/syhhhyso- -osyhhhys/`' +plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' +plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' +plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' +plain ' /ssyhhhysssssssssssssssssyhhhyss/' +plain ' .ossssssssssssssssssssssssssssso.' +plain ' :sssssssssssssssssssssssssssssssss:' +plain ' /sssssssssssssssssssssssssssssssssss/' +plain ' :sssssssssssssoosssssssoosssssssssssss:' +plain ' osssssssssssssoosssssssoossssssssssssso' +plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' +plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' +plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' +plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' +plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' +plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' +plain ' `.-:///////:-.`' + +_where=$PWD # track basedir as different Arch based distros are moving srcdir around + +cp "$_where"/linux56-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking +cp "$_where"/linux56-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking + +source "$_where"/customization.cfg # load default configuration from file + +# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. +if [ -e "$_EXT_CONFIG_PATH" ]; then + source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" +fi + +if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then + # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. + plain "Do you want to use a predefined optimized profile?" + read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; +fi +if [ "$_OPTIPROFILE" == "2" ]; then + source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" +elif [ "$_OPTIPROFILE" == "3" ]; then + source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" +fi + +# source cpuschedset early if present +if [ -e "$_where"/cpuschedset ]; then + source "$_where"/cpuschedset +fi + +# CPU SCHED selector +if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then + plain "What CPU sched variant do you want to build/install?" 
+ read -rp "`echo $' > 1.PDS\n 2.MuQSS\n 3.BMQ\n 4.CFS\nchoice[1-4?]: '`" CONDITION; + if [ "$CONDITION" == "2" ]; then + echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset + elif [ "$CONDITION" == "3" ]; then + echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset + elif [ "$CONDITION" == "4" ]; then + echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset + else + echo "_cpusched=\"pds\"" > "$_where"/cpuschedset + fi + if [ -n "$_custom_pkgbase" ]; then + echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset + fi +elif [ "$_cpusched" == "muqss" ] || [ "$_cpusched" == "MuQSS" ]; then + echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset +elif [ "$_cpusched" == "pds" ]; then + echo "_cpusched=\"pds\"" > "$_where"/cpuschedset +elif [ "$_cpusched" == "bmq" ]; then + echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset +else + echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset +fi + +source "$_where"/cpuschedset + +_basever=56 +if [ -n "$_custom_pkgbase" ]; then + pkgbase="${_custom_pkgbase}" +else + pkgbase=linux"${_basever}"-tkg-"${_cpusched}" +fi +pkgname=("${pkgbase}" "${pkgbase}-headers") +_basekernel=5.6 +_sub=rc7 +pkgver="${_basekernel}"."${_sub}" +pkgrel=1 +pkgdesc='Linux-tkg' +arch=('x86_64') # no i686 in here +url="http://www.kernel.org/" +license=('GPL2') +makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') +optdepends=('schedtool') +if pacman -Qq ccache &> /dev/null; then + msg2 'ccache was found and will be used' + options=('!strip' 'ccache') +else + msg2 'ccache was not found and will not be used' + options=('!strip') +fi +source=("https://git.kernel.org/torvalds/t/linux-${_basekernel}-${_sub}.tar.gz" + 'config.x86_64' # stock Arch config + #'config_hardened.x86_64' # hardened Arch config + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + 0004-glitched-ondemand-muqss.patch + 0004-glitched-muqss.patch + 0004-5.6-ck1.patch + 0005-glitched-ondemand-pds.patch + 0005-glitched-pds.patch + 0005-v5.6_undead-pds099o.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.6-fsync.patch + #0008-5.6-bcachefs.patch + 0009-glitched-ondemand-bmq.patch + 0009-glitched-bmq.patch + 0009-bmq_v5.6-r0.patch + 0011-ZFS-fix.patch + #0012-linux-hardened.patch + 0013-tp_smapi_ec.patch +) +sha256sums=('d18c846ef07a699561d3884a34f10941aed4e079c5f2a6fc151e9c54dd8d6373' + '1f2c39041b00ad8cddc3a17b589ccdcd4e9e4524bd47cf9f6292e2bbc105398b' + '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' + 'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' + 'dd5236f4109193dc518cf6e0a490600ae613c24232011f59d4069ce48ece32bd' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' + 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + 'd0509062a9939cc3a59c86b34a77b9d428dcea344a53a84e2cddd3d48a21591d' + '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' + '7fd8e776209dac98627453fda754bdf9aff4a09f27cb0b3766d7983612eb3c74' + '5255ec9387d85a1e1f50b5d7f6ac9983111b31f6e437153b7fab49bb31fdf000' + '90917e09bb06fbed6853efe9e52f8c2ba4066fca44accdf7608222212561104a' + '2d9260b80b43bbd605cf420d6bd53aa7262103dfd77196ba590ece5600b6dc0d' + 'e27ad5ff23a81b5be73a642db5186b447f336956a427d1300e8ccc49abf0dd74' + 
'965a517a283f265a012545fbb5cc9e516efc9f6166d2aa1baf7293a32a1086b7' + '1a647aa24074af0cc3a0ecbf8c720ab496be9fe1a62fab41c524eb6cc5dcccda' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' + '5fe8b22389d9df109f80fc4785908d1c32f1d469f5ef32fee613a0937965469e') + +export KBUILD_BUILD_HOST=archlinux +export KBUILD_BUILD_USER=$pkgbase +export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" + +user_patcher() { + # To patch the user because all your base are belong to us + local _patches=("$_where"/*."${_userpatch_ext}revert") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [ "$_CONDITION" == "y" ] || [ "$_user_patches_no_confirm" == "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Reverting your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 -R < "${_f}" + echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi + + _patches=("$_where"/*."${_userpatch_ext}patch") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [ "$_CONDITION" == "y" ] || [ "$_user_patches_no_confirm" == "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Applying your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 < "${_f}" + echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi +} + +prepare() { + rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build + + ln -s "${_where}/customization.cfg" "${srcdir}" # workaround + + cd "${srcdir}/linux-${_basekernel}-${_sub}" + + msg2 "Setting version..." 
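# Editor's note - illustrative usage sketch, not part of the original PKGBUILD.
# user_patcher() above picks up files dropped next to the PKGBUILD and named after
# _userpatch_ext ("my"): *.mypatch files are applied and *.myrevert files are reverted
# against the "linux-5.6" tree during prepare(). A hypothetical local fix could be hooked
# in like this (the patch file name is an example, nothing shipped by this repo):
#
#   cp ~/my-local-fix.mypatch .                                     # picked up on the next build
#   echo '_user_patches="true"' >> customization.cfg
#   echo '_user_patches_no_confirm="true"' >> customization.cfg     # optional: skip the y/N prompt
#   makepkg -si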
+ scripts/setlocalversion --save-scmversion + echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel + echo "" > localversion.20-pkgname + + # add upstream patch + #patch -p1 -i ../patch-"${pkgver}" + + # ARCH Patches + if [ "${_configfile}" == "config_hardened.x86_64" ] && [ "${_cpusched}" == "cfs" ]; then + msg2 "Using linux hardened patchset" + patch -Np1 -i ../0012-linux-hardened.patch + else + patch -Np1 -i ../0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + fi + + # TkG + patch -Np1 -i ../0002-clear-patches.patch + + patch -Np1 -i ../0003-glitched-base.patch + + if [ "${_tp_smapi_ec}" != "false" ]; then + patch -Np1 -i ../0013-tp_smapi_ec.patch + fi + + if [ "${_cpusched}" == "MuQSS" ]; then + # MuQSS + patch -Np1 -i ../0004-5.6-ck1.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0004-glitched-ondemand-muqss.patch + fi + patch -Np1 -i ../0004-glitched-muqss.patch + elif [ "${_cpusched}" == "pds" ]; then + # PDS-mq + patch -Np1 -i ../0005-v5.6_undead-pds099o.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0005-glitched-ondemand-pds.patch + fi + patch -Np1 -i ../0005-glitched-pds.patch + elif [ "${_cpusched}" == "bmq" ]; then + # BMQ + patch -Np1 -i ../0009-bmq_v5.6-r0.patch + if [ "${_aggressive_ondemand}" == "true" ]; then + patch -Np1 -i ../0009-glitched-ondemand-bmq.patch + fi + patch -Np1 -i ../0009-glitched-bmq.patch + else + patch -Np1 -i ../0003-glitched-cfs.patch + fi + + if [ -z "${_configfile}" ]; then + _configfile="config.x86_64" + fi + + cat "${srcdir}/${_configfile}" > ./.config + + # Set some -tkg defaults + echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config + sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' ./.config + echo "CONFIG_DEFAULT_CAKE=y" >> ./.config + echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config + echo "CONFIG_TP_SMAPI=m" >> ./.config + echo "# CONFIG_NTP_PPS is not set" >> ./.config + sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config + sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config + sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config + sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config + #sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config + sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config + echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config + echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config + + # Inject cpuopts options + echo "# CONFIG_MK8SSE3 is not set" >> ./.config + echo "# CONFIG_MK10 is not set" >> ./.config + echo "# CONFIG_MBARCELONA is not set" >> ./.config + echo "# CONFIG_MBOBCAT is not set" >> ./.config + echo "# CONFIG_MJAGUAR is not set" >> ./.config + echo "# CONFIG_MBULLDOZER is not set" >> ./.config + echo "# CONFIG_MPILEDRIVER is not set" >> ./.config + echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config + echo "# CONFIG_MEXCAVATOR is not set" >> ./.config + echo "# CONFIG_MZEN is not set" >> ./.config + echo "# CONFIG_MZEN2 is not set" >> ./.config + echo "# CONFIG_MATOM is not set" >> ./.config + echo "# CONFIG_MNEHALEM is not set" >> ./.config + echo "# CONFIG_MWESTMERE is not set" >> ./.config + echo "# CONFIG_MSILVERMONT is not set" >> ./.config + echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config + echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config + echo "# CONFIG_MHASWELL is not set" >> ./.config + echo "# CONFIG_MBROADWELL is not set" >> ./.config + echo "# 
CONFIG_MSKYLAKE is not set" >> ./.config + echo "# CONFIG_MSKYLAKEX is not set" >> ./.config + echo "# CONFIG_MCANNONLAKE is not set" >> ./.config + echo "# CONFIG_MICELAKE is not set" >> ./.config + echo "# CONFIG_MGOLDMONT is not set" >> ./.config + echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config + echo "# CONFIG_MCASCADELAKE is not set" >> ./.config + + # Disable some debugging + if [ "${_debugdisable}" == "true" ]; then + sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config + fi + + if [ "${_cpusched}" == "MuQSS" ]; then + # MuQSS default config + echo "CONFIG_SCHED_MUQSS=y" >> ./.config + elif [ "${_cpusched}" == "pds" ]; then + # PDS default config + echo "CONFIG_SCHED_PDS=y" >> ./.config + elif [ "${_cpusched}" == "bmq" ]; then + # BMQ default config + echo "CONFIG_SCHED_BMQ=y" >> ./.config + fi + + if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then + # Disable CFS + sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config + sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config + # sched yield type + if [ -n "$_sched_yield_type" ]; then + CONDITION0="$_sched_yield_type" + else + plain "" + plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." + plain "" + plain "For PDS and MuQSS:" + plain "0: No yield." + plain "1: Yield only to better priority/deadline tasks." + plain "2: Expire timeslice and recalculate deadline." + plain "" + plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" + plain "0: No yield." + plain "1: Deboost and requeue task. (default)" + plain "2: Set rq skip task." + read -rp "`echo $'\n > 0. Recommended option for gaming on PDS and MuQSS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; + fi + if [ "$CONDITION0" == "1" ]; then + msg2 "Using default CPU sched yield type (1)" + elif [ "$CONDITION0" == "2" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c + else + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c + fi + fi + + # Round Robin interval + if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then + if [ -n "$_rr_interval" ]; then + CONDITION1="$_rr_interval" + else + plain "" + plain "Round Robin interval is the longest duration two tasks with the same nice level will" + plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" + plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" + plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
+ plain "" + plain "MuQSS default: 6ms" + plain "PDS default: 4ms" + plain "BMQ default: 2ms" + read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; + fi + if [ "$CONDITION1" == "1" ]; then + msg2 "Using 2ms rr_interval" + _rrvalue="2" + elif [ "$CONDITION1" == "2" ]; then + msg2 "Using 4ms rr_interval" + _rrvalue="4" + elif [ "$CONDITION1" == "3" ]; then + msg2 "Using 6ms rr_interval" + _rrvalue="6" + elif [ "$CONDITION1" == "4" ]; then + msg2 "Using 8ms rr_interval" + _rrvalue="8" + else + msg2 "Using default rr_interval" + _rrvalue="default" + fi + if [ "$_rrvalue" != "default" ]; then + if [ "${_cpusched}" == "MuQSS" ]; then + sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "pds" ]; then + sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "bmq" ]; then + sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/"${_cpusched}".c + fi + else + if [ "${_cpusched}" == "bmq" ]; then + sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/"${_cpusched}".c + fi + fi + fi + + # zenify + if [ "$_zenify" == "true" ]; then + echo "CONFIG_ZENIFY=y" >> ./.config + elif [ "$_zenify" == "false" ]; then + echo "# CONFIG_ZENIFY is not set" >> ./.config + fi + + # compiler optimization level + if [ "$_compileroptlevel" == "1" ]; then + echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config + elif [ "$_compileroptlevel" == "2" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config + elif [ "$_compileroptlevel" == "3" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config + echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config + fi + + # cpu opt + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then + echo "# CONFIG_MNATIVE is not set" >> ./.config + fi + + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then + sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config + fi + + if [ "$_processor_opt" == "native" ]; then + echo "CONFIG_MNATIVE=y" >> ./.config + elif [ "$_processor_opt" == "k8" ]; then + sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config + elif [ "$_processor_opt" == "k8sse3" ]; then + sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config + elif [ "$_processor_opt" == "k10" ]; then + sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config + elif [ "$_processor_opt" == "barcelona" ]; then + sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config + elif [ "$_processor_opt" == "bobcat" ]; then + sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config + elif [ "$_processor_opt" == "jaguar" ]; then + sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config + elif [ "$_processor_opt" == "bulldozer" ]; then + sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config + elif [ "$_processor_opt" == 
"piledriver" ]; then + sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config + elif [ "$_processor_opt" == "steamroller" ]; then + sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config + elif [ "$_processor_opt" == "excavator" ]; then + sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config + elif [ "$_processor_opt" == "zen" ]; then + sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config + elif [ "$_processor_opt" == "zen2" ]; then + sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config + elif [ "$_processor_opt" == "mpsc" ]; then + sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config + elif [ "$_processor_opt" == "atom" ]; then + sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config + elif [ "$_processor_opt" == "core2" ]; then + sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config + elif [ "$_processor_opt" == "nehalem" ]; then + sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config + elif [ "$_processor_opt" == "westmere" ]; then + sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config + elif [ "$_processor_opt" == "silvermont" ]; then + sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config + elif [ "$_processor_opt" == "sandybridge" ]; then + sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config + elif [ "$_processor_opt" == "ivybridge" ]; then + sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config + elif [ "$_processor_opt" == "haswell" ]; then + sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config + elif [ "$_processor_opt" == "broadwell" ]; then + sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config + elif [ "$_processor_opt" == "skylake" ]; then + sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config + elif [ "$_processor_opt" == "skylakex" ]; then + sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config + elif [ "$_processor_opt" == "cannonlake" ]; then + sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config + elif [ "$_processor_opt" == "icelake" ]; then + sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config + elif [ "$_processor_opt" == "goldmont" ]; then + sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config + elif [ "$_processor_opt" == "goldmontplus" ]; then + sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config + elif [ "$_processor_opt" == "cascadelake" ]; then + sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config + fi + + # irq threading + if [ "$_irq_threading" == "true" ]; then + echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config + elif [ "$_irq_threading" == "false" ]; then + echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config + fi + + # smt nice + if [ "$_smt_nice" == "true" ]; then + echo "CONFIG_SMT_NICE=y" >> ./.config + elif [ "$_smt_nice" == "false" ]; then + echo "# CONFIG_SMT_NICE is not set" >> ./.config + fi + + # random trust cpu + if [ "$_random_trust_cpu" == "true" ]; then + sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config + fi + + # rq sharing + if [ "$_runqueue_sharing" == "none" ]; then + echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ 
-z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" == "smt" ]; then + echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "mc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "smp" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" == "all" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config + elif [ "$_runqueue_sharing" == "mc-llc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + fi + + # timer freq + if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + if [ "$_timer_freq" == "1000" ]; then + sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "CONFIG_HZ_1000_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "750" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "CONFIG_HZ_750=y" >> ./.config + echo "CONFIG_HZ_750_NODEF=y" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "500" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" == "100" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + echo "CONFIG_HZ_100=y" >> ./.config + echo "CONFIG_HZ_100_NODEF=y" >> ./.config + fi + elif [ "${_cpusched}" == "MuQSS" ] && [ -z "$_timer_freq" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config + echo "# CONFIG_HZ_500 is not 
set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + echo "CONFIG_HZ_100=y" >> ./.config + echo "CONFIG_HZ_100_NODEF=y" >> ./.config + else + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + fi + + # default cpu gov + if [ "$_default_cpu_gov" == "performance" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config + elif [ "$_default_cpu_gov" == "ondemand" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config + fi + + # ACPI_CPUFREQ disablement + if [ "$_disable_acpi_cpufreq" == "true" ]; then + sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config + fi + + # ftrace + if [ -z "$_ftracedisable" ]; then + plain "" + plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" + plain "and analyzing of kernel functions." + read -rp "`echo $' > N/y : '`" CONDITION2; + fi + if [ "$CONDITION2" == "y" ] || [ "$_ftracedisable" == "true" ]; then + sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config + sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config + fi + + # disable numa + if [ -z "$_numadisable" ]; then + plain "" + plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." + plain "https://bbs.archlinux.org/viewtopic.php?id=239174" + read -rp "`echo $' > N/y : '`" CONDITION3; + fi + if [ "$CONDITION3" == "y" ] || [ "$_numadisable" == "true" ]; then + # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU + sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ + -i -e '/CONFIG_AMD_NUMA=y/d' \ + -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ + -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ + -i -e '/# CONFIG_NUMA_EMU is not set/d' \ + -i -e '/CONFIG_NODES_SHIFT=6/d' \ + -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ + -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ + -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config + fi + + # tickless + if [ -z "$_tickless" ]; then + plain "" + plain "Use CattaRappa mode (Tickless/Dynticks) ?" + plain "Can give higher performances in many cases but lower consistency on some hardware." + plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." 
+ if [ "${_cpusched}" == "MuQSS" ]; then + read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + else + read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + fi + fi + if [ "$CONDITION4" == "0" ] || [ "$_tickless" == "0" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config + elif [ "$CONDITION4" == "2" ] || [ "$_tickless" == "2" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + if [ "${_cpusched}" == "MuQSS" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config + echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config + fi + fi + + # voluntary preempt + if [ -z "$_voluntary_preempt" ]; then + plain "" + plain "Use explicit preemption points?" + plain "It can improve latency on PDS (at the cost of throughput)" + plain "and improve throughput on other schedulers (at the cost of latency)" + read -rp "`echo $' > N/y : '`" CONDITION5; + fi + if [ "$CONDITION5" == "y" ] || [ "$_voluntary_preempt" == "true" ]; then + sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config + sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config + sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config + fi + + # Open Firmware support + if [ -z "$_OFenable" ]; then + plain "" + plain "Enable Device Tree and Open Firmware support?" + read -rp "`echo $' > N/y : '`" CONDITION6; + fi + if [ "$CONDITION6" == "y" ] || [ "$_OFenable" == "true" ]; then + sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config + fi + + # acs override + if [ -z "$_acs_override" ]; then + plain "" + plain "Use ACS override patch?" 
+ plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" + read -rp "`echo $' > N/y : '`" CONDITION7; + fi + if [ "$CONDITION7" == "y" ] || [ "$_acs_override" == "true" ]; then + patch -Np1 -i ../0006-add-acs-overrides_iommu.patch + fi + + # bcachefs +# if [ -z "$_bcachefs" ]; then +# plain "" +# plain "Add Bcache filesystem support? You'll have to install bcachefs-tools-git from AUR for utilities." +# plain "https://bcachefs.org/" +# read -rp "`echo $' > N/y : '`" CONDITION8; +# fi +# if [ "$CONDITION8" == "y" ] || [ "$_bcachefs" == "true" ]; then +# patch -Np1 -i ../0008-5.6-bcachefs.patch +# echo "CONFIG_BCACHEFS_FS=m" >> ./.config +# echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config +# echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config +# echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config +# echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config +# echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config +# fi + + # fsync support + if [ -z "$_fsync" ]; then + plain "" + plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" + plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" + read -rp "`echo $' > N/y : '`" CONDITION9; + fi + if [ "$CONDITION9" == "y" ] || [ "$_fsync" == "true" ]; then + patch -Np1 -i ../0007-v5.6-fsync.patch + fi + + # ZFS fix + if [ -z "$_zfsfix" ]; then + plain "" + plain "Add back missing symbol for AES-NI/AVX support on ZFS" + plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" + read -rp "`echo $' > N/y : '`" CONDITION11; + fi + if [ "$CONDITION11" == "y" ] || [ "$_zfsfix" == "true" ]; then + patch -Np1 -i ../0011-ZFS-fix.patch + fi + + # Community patches + if [ -n "$_community_patches" ]; then + _community_patches=($_community_patches) + for _p in ${_community_patches[@]}; do + ln -s "$_where"/../community-patches/linux56-tkg/$_p "$_where"/ + done + fi + + # userpatches + if [ "$_user_patches" == "true" ]; then + _userpatch_target="linux-${_basekernel}" + _userpatch_ext="my" + user_patcher + fi + + # Community patches removal + for _p in ${_community_patches[@]}; do + rm -f "$_where"/$_p + done + + # don't run depmod on 'make install'. We'll do this ourselves in packaging + sed -i '2iexit 0' scripts/depmod.sh + + # get kernel version + make prepare + + # modprobed-db + if [ -z "$_modprobeddb" ]; then + plain "" + plain "Use modprobed db to clean config from unneeded modules?" + plain "Speeds up compilation considerably. Requires root." + plain "https://wiki.archlinux.org/index.php/Modprobed-db" + plain "!!!! Make sure to have a well populated db !!!!" + read -rp "`echo $' > N/y : '`" CONDITIONMPDB; + fi + if [ "$CONDITIONMPDB" == "y" ] || [ "$_modprobeddb" == "true" ]; then + sudo modprobed-db recall + make localmodconfig + fi + + if [ true = "$_config_fragments" ]; then + local fragments=() + mapfile -d '' -t fragments < <(find "$_where" -type f -name "*.myfrag" -print0) + + if [ true = "$_config_fragments_no_confirm" ]; then + printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" + else + for i in "${!fragments[@]}"; do + while true; do + read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB + CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONMPDB" in + y|yes) + break;; + n|no|'') + unset fragments[$i] + break;; + *) + echo 'Please answer with yes or no' + esac + done + done + fi + + if [ 0 -lt "${#fragments[@]}" ]; then + scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" + fi + fi + + # menuconfig / nconfig + if [ -z "$_menunconfig" ]; then + plain "" + plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" + plain "to configure the kernel before building it?" + plain "If you do, make sure your terminal is currently" + plain "at least 19 lines by 80 columns large or you'll get an error :D" + read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n choice[0-2?]: '`" CONDITIONMNC; + _menunconfig="$CONDITIONMNC" + fi + if [ 1 = "$_menunconfig" ]; then + cp .config .config.orig + make menuconfig + elif [ 2 = "$_menunconfig" ]; then + cp .config .config.orig + make nconfig + else + # rewrite configuration + yes "" | make config >/dev/null + fi + if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ]; then + if [ -z "${_diffconfig}" ]; then + while true; do + read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF + CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONF" in + y|yes) + _diffconfig=true + break;; + n|no|'') + _diffconfig=false + break;; + *) + echo 'Please answer with yes or no' + esac + done + fi + if [ true = "$_diffconfig" ]; then + if [ -z "$_diffconfig_name" ]; then + IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name + fi + if [ -z "$_diffconfig_name" ]; then + echo 'No file name given, not generating config fragment.' + else ( + prev_pwd="${PWD:-$(pwd)}" + cd "$_where" + "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" + ) fi + fi + rm .config.orig + fi + + make -s kernelrelease > version + msg2 "Prepared %s version %s" "$pkgbase" "$(&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) +} + +hackbase() { + pkgdesc="The $pkgdesc kernel and modules" + depends=('coreutils' 'kmod' 'initramfs') + optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' + 'crda: to set the correct wireless channels of your country.' + 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' + 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' + 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' 
+ 'update-grub: Simple wrapper around grub-mkconfig.') + provides=("linux=${pkgver}" "${pkgbase}") + + cd "${srcdir}/linux-${_basekernel}-${_sub}" + + # get kernel version + local _kernver="$( +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. +@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. 
+ ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. ++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux56-rc-tkg/linux56-tkg-patches/0002-clear-patches.patch b/linux56-rc-tkg/linux56-tkg-patches/0002-clear-patches.patch new file mode 100644 index 0000000..a7c9d4a --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0002-clear-patches.patch @@ -0,0 +1,354 @@ +From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c25acace7d91..0ddebdad9f5b 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -61,7 +61,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +2.20.1 + +From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 8b5d85c91e9d..5e2d813a048d 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ 
.target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +2.20.1 + +From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index cf3c5095c10e..b30d51837b2d 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3897,8 +3897,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.20.1 + +From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index eef04551eae7..1ec5ab4c8ff7 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -720,6 +720,7 
@@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] drivers: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index aaef17cc6512..d08f3a394929 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -58,15 +58,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/linux56-rc-tkg/linux56-tkg-patches/0003-glitched-base.patch b/linux56-rc-tkg/linux56-tkg-patches/0003-glitched-base.patch new file mode 100644 index 0000000..7934ead --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0003-glitched-base.patch @@ -0,0 +1,1459 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index 87f1fc9..b3be470 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -50,8 +50,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" + +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index af9c967782f6..bf07a8c0f495 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -123,6 +123,7 @@ config MPENTIUMM + config MPENTIUM4 + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" + depends on X86_32 ++ select X86_P6_NOP + ---help--- + Select this for Intel Pentium 4 chips. This includes the + Pentium 4, Pentium D, P4-based Celeron and Xeon, and +@@ -155,9 +156,8 @@ config MPENTIUM4 + -Paxville + -Dempsey + +- + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + ---help--- + Select this for an AMD K6-family processor. 
Enables use of +@@ -165,7 +165,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + ---help--- + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +173,90 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + ---help--- + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ ---help--- ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ ---help--- ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ ---help--- ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ ---help--- ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ ---help--- ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ ---help--- ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ ---help--- ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ ---help--- ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ ---help--- ++ Select this for AMD Family 15h Excavator processors. ++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ ---help--- ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ ---help--- ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -260,6 +338,7 @@ config MVIAC7 + + config MPSC + bool "Intel P4 / older Netburst based Xeon" ++ select X86_P6_NOP + depends on X86_64 + ---help--- + Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey +@@ -269,8 +348,19 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" ++ select X86_P6_NOP + ---help--- + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,14 +368,133 @@ config MCORE2 + family in /proc/cpuinfo. 
Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" ++ select X86_P6_NOP + ---help--- + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ select X86_P6_NOP ++ ---help--- ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -294,6 +503,19 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE ++ bool "Native optimizations autodetected by GCC" ++ ---help--- ++ ++ GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. -march=native ++ also detects and applies additional settings beyond -march specific ++ to your CPU, (eg. -msse4). Unless you have a specific reason not to ++ (e.g. 
distcc cross-compiling), you should probably be using ++ -march=native rather than anything listed below. ++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -318,7 +540,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "4" if MELAN || M486SX || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -336,35 +558,36 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MATOM || MNATIVE + + config X86_USE_3DNOW + def_bool y + depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML + +-# +-# P6_NOPs are a relatively minor optimization that require a family >= +-# 6 processor, except that it is broken on certain VIA chips. +-# Furthermore, AMD chips prefer a totally different sequence of NOPs +-# (which work on all CPUs). In addition, it looks like Virtual PC +-# does not understand them. +-# +-# As a result, disallow these if we're not compiling for X86_64 (these +-# NOPs do work on all x86-64 capable chips); the list of processors in +-# the right-hand clause are the cores that benefit from this optimization. +-# + config X86_P6_NOP +- def_bool y +- depends on X86_64 +- depends on (MCORE2 || MPENTIUM4 || MPSC) ++ default n ++ bool "Support for P6_NOPs on Intel chips" ++ depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE) ++ ---help--- ++ P6_NOPs are a relatively minor optimization that require a family >= ++ 6 processor, except that it is broken on certain VIA chips. 
++ Furthermore, AMD chips prefer a totally different sequence of NOPs ++ (which work on all CPUs). In addition, it looks like Virtual PC ++ does not understand them. ++ ++ As a result, disallow these if we're not compiling for X86_64 (these ++ NOPs do work on all x86-64 capable chips); the list of processors in ++ the right-hand clause are the cores that benefit from this optimization. ++ ++ Say Y if you have Intel CPU newer than Pentium Pro, N otherwise. + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE || MATOM) || X86_64 + + config X86_CMPXCHG64 + def_bool y +@@ -374,7 +597,7 @@ config X86_CMPXCHG64 + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 94df0868804b..dcbed7e3a070 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -119,13 +119,53 @@ else + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) ++ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) ++ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) ++ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) ++ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) ++ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) ++ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) ++ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) ++ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) ++ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) ++ cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4) ++ cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1) ++ cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ +- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) +- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) ++ 
cflags-$(CONFIG_MNEHALEM) += \ ++ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) ++ cflags-$(CONFIG_MWESTMERE) += \ ++ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) ++ cflags-$(CONFIG_MSILVERMONT) += \ ++ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) ++ cflags-$(CONFIG_MGOLDMONT) += \ ++ $(call cc-option,-march=goldmont,$(call cc-option,-mtune=goldmont)) ++ cflags-$(CONFIG_MGOLDMONTPLUS) += \ ++ $(call cc-option,-march=goldmont-plus,$(call cc-option,-mtune=goldmont-plus)) ++ cflags-$(CONFIG_MSANDYBRIDGE) += \ ++ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) ++ cflags-$(CONFIG_MIVYBRIDGE) += \ ++ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) ++ cflags-$(CONFIG_MHASWELL) += \ ++ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) ++ cflags-$(CONFIG_MBROADWELL) += \ ++ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) ++ cflags-$(CONFIG_MSKYLAKE) += \ ++ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) ++ cflags-$(CONFIG_MSKYLAKEX) += \ ++ $(call cc-option,-march=skylake-avx512,$(call cc-option,-mtune=skylake-avx512)) ++ cflags-$(CONFIG_MCANNONLAKE) += \ ++ $(call cc-option,-march=cannonlake,$(call cc-option,-mtune=cannonlake)) ++ cflags-$(CONFIG_MICELAKE) += \ ++ $(call cc-option,-march=icelake-client,$(call cc-option,-mtune=icelake-client)) ++ cflags-$(CONFIG_MCASCADELAKE) += \ ++ $(call cc-option,-march=cascadelake,$(call cc-option,-mtune=cascadelake)) ++ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu +index cd3056759880..2c81838df533 100644 +--- a/arch/x86/Makefile_32.cpu ++++ b/arch/x86/Makefile_32.cpu +@@ -24,7 +24,19 @@ cflags-$(CONFIG_MK6) += -march=k6 + # Please note, that patches that add -march=athlon-xp and friends are pointless. + # They make zero difference whatsosever to performance at this time. 
+ cflags-$(CONFIG_MK7) += -march=athlon ++cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) ++cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) ++cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) ++cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) ++cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) ++cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) ++cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) ++cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) ++cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) ++cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon) ++cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon) ++cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2,-march=athlon) + cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 + cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +@@ -33,8 +45,22 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-fu + cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) + cflags-$(CONFIG_MVIAC7) += -march=i686 + cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) ++cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) ++cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) ++cflags-$(CONFIG_MGOLDMONT) += -march=i686 $(call tune,goldmont) ++cflags-$(CONFIG_MGOLDMONTPLUS) += -march=i686 $(call tune,goldmont-plus) ++cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) ++cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) ++cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) ++cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) ++cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) ++cflags-$(CONFIG_MSKYLAKEX) += -march=i686 $(call tune,skylake-avx512) ++cflags-$(CONFIG_MCANNONLAKE) += -march=i686 $(call tune,cannonlake) ++cflags-$(CONFIG_MICELAKE) += -march=i686 $(call tune,icelake-client) ++cflags-$(CONFIG_MCASCADELAKE) += -march=i686 $(call tune,cascadelake) ++cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + + # AMD Elan support + cflags-$(CONFIG_MELAN) += -march=i486 +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index c215d2762488..a4fddfe3d4fb 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ -27,6 +27,36 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE ++#define MODULE_PROC_FAMILY "NATIVE " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT 
++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY "CASCADELAKE " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -45,6 +75,28 @@ struct mod_arch_specific { + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE +diff --git a/fs/dcache.c b/fs/dcache.c +index 2acfc69878f5..3f1131431e06 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -69,7 +69,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 211890edf37e..37121563407d 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we average the RT time consumption, measured +@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. +- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. 
+diff --git a/mm/zswap.c b/mm/zswap.c +index 61a5c41972db..2674c2806130 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -91,7 +91,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { + module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); + + /* Crypto compressor to use */ +-#define ZSWAP_COMPRESSOR_DEFAULT "lzo" ++#define ZSWAP_COMPRESSOR_DEFAULT "lz4" + static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + static int zswap_compressor_param_set(const char *, + const struct kernel_param *); +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 71f39410691b..288f9679e883 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: Zenify & stuff + + +diff --git a/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt +new file mode 100644 +index 000000000000..a249678a8866 +--- /dev/null ++++ b/Documentation/tp_smapi.txt +@@ -0,0 +1,275 @@ ++tp_smapi version 0.42 ++IBM ThinkPad hardware functions driver ++ ++Author: Shem Multinymous ++Project: http://sourceforge.net/projects/tpctl ++Wiki: http://thinkwiki.org/wiki/tp_smapi ++List: linux-thinkpad@linux-thinkpad.org ++ (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) ++ ++Description ++----------- ++ ++ThinkPad laptops include a proprietary interface called SMAPI BIOS ++(System Management Application Program Interface) which provides some ++hardware control functionality that is not accessible by other means. ++ ++This driver exposes some features of the SMAPI BIOS through a sysfs ++interface. It is suitable for newer models, on which SMAPI is invoked ++through IO port writes. Older models use a different SMAPI interface; ++for those, try the "thinkpad" module from the "tpctl" package. ++ ++WARNING: ++This driver uses undocumented features and direct hardware access. ++It thus cannot be guaranteed to work, and may cause arbitrary damage ++(especially on models it wasn't tested on). ++ ++ ++Module parameters ++----------------- ++ ++thinkpad_ec module: ++ force_io=1 lets thinkpad_ec load on some recent ThinkPad models ++ (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. ++tp_smapi module: ++ debug=1 enables verbose dmesg output. ++ ++ ++Usage ++----- ++ ++Control of battery charging thresholds (in percents of current full charge ++capacity): ++ ++# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh ++# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh ++# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh ++ ++ (This is useful since Li-Ion batteries wear out much faster at very ++ high or low charge levels. The driver will also keeps the thresholds ++ across suspend-to-disk with AC disconnected; this isn't done ++ automatically by the hardware.) ++ ++Inhibiting battery charging for 17 minutes (overrides thresholds): ++ ++# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++# echo 0 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes # stop ++# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++ ++ (This can be used to control which battery is charged when using an ++ Ultrabay battery.) 
++ ++Forcing battery discharging even if AC power available: ++ ++# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge # start discharge ++# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge # stop discharge ++# cat /sys/devices/platform/smapi/BAT0/force_discharge ++ ++ (When AC is connected, forced discharging will automatically stop ++ when battery is fully depleted -- this is useful for calibration. ++ Also, this attribute can be used to control which battery is discharged ++ when both a system battery and an Ultrabay battery are connected.) ++ ++Misc read-only battery status attributes (see note about HDAPS below): ++ ++/sys/devices/platform/smapi/BAT0/installed # 0 or 1 ++/sys/devices/platform/smapi/BAT0/state # idle/charging/discharging ++/sys/devices/platform/smapi/BAT0/cycle_count # integer counter ++/sys/devices/platform/smapi/BAT0/current_now # instantaneous current ++/sys/devices/platform/smapi/BAT0/current_avg # last minute average ++/sys/devices/platform/smapi/BAT0/power_now # instantaneous power ++/sys/devices/platform/smapi/BAT0/power_avg # last minute average ++/sys/devices/platform/smapi/BAT0/last_full_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/remaining_percent # remaining percent of energy (set by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_percent_error # error range of remaing_percent (not reset by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_running_time # in minutes, by last minute average power ++/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantenous power ++/sys/devices/platform/smapi/BAT0/remaining_charging_time # in minutes ++/sys/devices/platform/smapi/BAT0/remaining_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/design_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/voltage # in mV ++/sys/devices/platform/smapi/BAT0/design_voltage # in mV ++/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current ++/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage ++/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below ++/sys/devices/platform/smapi/BAT0/manufacturer # string ++/sys/devices/platform/smapi/BAT0/model # string ++/sys/devices/platform/smapi/BAT0/barcoding # string ++/sys/devices/platform/smapi/BAT0/chemistry # string ++/sys/devices/platform/smapi/BAT0/serial # integer ++/sys/devices/platform/smapi/BAT0/manufacture_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/first_use_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/temperature # in milli-Celsius ++/sys/devices/platform/smapi/BAT0/dump # see below ++/sys/devices/platform/smapi/ac_connected # 0 or 1 ++ ++The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups ++in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models, ++the battery contains 3 cell groups in series, where each group consisting of 2 ++or 3 cells connected in parallel. The voltage of each group is given by these ++attributes, and their sum (roughly) equals the "voltage" attribute. ++(The effective performance of the battery is determined by the weakest group, ++i.e., the one those voltage changes most rapidly during dis/charging.) ++ ++The "BAT0/dump" attribute gives a a hex dump of the raw status data, which ++contains additional data now in the above (if you can figure it out). Some ++unused values are autodetected and replaced by "--": ++ ++In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g. ++in the UltraBay). 
++ ++ ++Raw SMAPI calls: ++ ++/sys/devices/platform/smapi/smapi_request ++This performs raw SMAPI calls. It uses a bad interface that cannot handle ++multiple simultaneous access. Don't touch it, it's for development only. ++If you did touch it, you would so something like ++# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request ++# cat /sys/devices/platform/smapi/smapi_request ++and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd ++value, converted to decimal is 75: the current charge stop threshold. ++ ++ ++Model-specific status ++--------------------- ++ ++Works (at least partially) on the following ThinkPad model: ++* A30 ++* G41 ++* R40, R50p, R51, R52 ++* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60, T61, T400, T410, T420 (partially) ++* X24, X31, X32, X40, X41, X60, X61, X200, X201, X220 (partially) ++* Z60t, Z61m ++ ++Does not work on: ++* X230 and newer ++* T430 and newer ++* Any ThinkPad Edge ++* Any ThinkPad Yoga ++* Any ThinkPad L series ++* Any ThinkPad P series ++ ++Not all functions are available on all models; for detailed status, see: ++ http://thinkwiki.org/wiki/tp_smapi ++ ++Please report success/failure by e-mail or on the Wiki. ++If you get a "not implemented" or "not supported" message, your laptop ++probably just can't do that (at least not via the SMAPI BIOS). ++For negative reports, follow the bug reporting guidelines below. ++If you send me the necessary technical data (i.e., SMAPI function ++interfaces), I will support additional models. ++ ++ ++Additional HDAPS features ++------------------------- ++ ++The modified hdaps driver has several improvements on the one in mainline ++(beyond resolving the conflict with thinkpad_ec and tp_smapi): ++ ++- Fixes reliability and improves support for recent ThinkPad models ++ (especially *60 and newer). Unlike the mainline driver, the modified hdaps ++ correctly follows the Embedded Controller communication protocol. ++ ++- Extends the "invert" parameter to cover all possible axis orientations. ++ The possible values are as follows. ++ Let X,Y denote the hardware readouts. ++ Let R denote the laptop's roll (tilt left/right). ++ Let P denote the laptop's pitch (tilt forward/backward). ++ invert=0: R= X P= Y (same as mainline) ++ invert=1: R=-X P=-Y (same as mainline) ++ invert=2: R=-X P= Y (new) ++ invert=3: R= X P=-Y (new) ++ invert=4: R= Y P= X (new) ++ invert=5: R=-Y P=-X (new) ++ invert=6: R=-Y P= X (new) ++ invert=7: R= Y P=-X (new) ++ It's probably easiest to just try all 8 possibilities and see which yields ++ correct results (e.g., in the hdaps-gl visualisation). ++ ++- Adds a whitelist which automatically sets the correct axis orientation for ++ some models. If the value for your model is wrong or missing, you can override ++ it using the "invert" parameter. Please also update the tables at ++ http://www.thinkwiki.org/wiki/tp_smapi and ++ http://www.thinkwiki.org/wiki/List_of_DMI_IDs ++ and submit a patch for the whitelist in hdaps.c. ++ ++- Provides new attributes: ++ /sys/devices/platform/hdaps/sampling_rate: ++ This determines the frequency at which the host queries the embedded ++ controller for accelerometer data (and informs the hdaps input devices). ++ Default=50. ++ /sys/devices/platform/hdaps/oversampling_ratio: ++ When set to X, the embedded controller is told to do physical accelerometer ++ measurements at a rate that is X times higher than the rate at which ++ the driver reads those measurements (i.e., X*sampling_rate). 
This ++ makes the readouts from the embedded controller more fresh, and is also ++ useful for the running average filter (see next). Default=5 ++ /sys/devices/platform/hdaps/running_avg_filter_order: ++ When set to X, reported readouts will be the average of the last X physical ++ accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a ++ high value decreases readout fluctuations. The averaging is handled by the ++ embedded controller, so no CPU resources are used. Higher values make the ++ readouts smoother, since it averages out both sensor noise (good) and abrupt ++ changes (bad). Default=2. ++ ++- Provides a second input device, which publishes the raw accelerometer ++ measurements (without the fuzzing needed for joystick emulation). This input ++ device can be matched by a udev rule such as the following (all on one line): ++ KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1", ++ ATTRS{modalias}=="input:b0019v1014p5054e4801-*", ++ SYMLINK+="input/hdaps/accelerometer-event ++ ++A new version of the hdapsd userspace daemon, which uses the input device ++interface instead of polling sysfs, is available seprately. Using this reduces ++the total interrupts per second generated by hdaps+hdapsd (on tickless kernels) ++to 50, down from a value that fluctuates between 50 and 100. Set the ++sampling_rate sysfs attribute to a lower value to further reduce interrupts, ++at the expense of response latency. ++ ++Licensing note: all my changes to the HDAPS driver are licensed under the ++GPL version 2 or, at your option and to the extent allowed by derivation from ++prior works, any later version. My version of hdaps is derived work from the ++mainline version, which at the time of writing is available only under ++GPL version 2. ++ ++Bug reporting ++------------- ++ ++Mail . Please include: ++* Details about your model, ++* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with ++ the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1"). ++* Output of "dmidecode | grep -C5 Product" ++* Does the failed functionality works under Windows? ++ ++ ++More about SMAPI ++---------------- ++ ++For hints about what may be possible via the SMAPI BIOS and how, see: ++ ++* IBM Technical Reference Manual for the ThinkPad 770 ++ (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) ++* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x"). ++* drivers/char/mwave/smapi.c in the Linux kernel tree.* ++* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net). ++* The SMAPI_* constants in tp_smapi.c. ++ ++Note that in the above Technical Reference and in the "thinkpad" module, ++SMAPI is invoked through a function call to some physical address. However, ++the interface used by tp_smapi and the above mwave drive, and apparently ++required by newer ThinkPad, is different: you set the parameters up in the ++CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this ++triggers an SMI (System Management Interrupt), causing the CPU to enter ++SMM (System Management Mode) and run the BIOS firmware; the results are ++returned in the CPU's registers. It is not clear what is the relation between ++the two variants of SMAPI, though the assignment of error codes seems to be ++similar. ++ ++In addition, the embedded controller on ThinkPad laptops has a non-standard ++interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip). 
++The interface provides various system management services (currently known: ++battery information and accelerometer readouts). For more information see the ++thinkpad_ec module and the H8S hardware documentation: ++http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..c1e59dc04209 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1244,7 +1244,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + + config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" +- depends on ARC + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives + help + Choosing this option will pass "-O3" to your compiler to optimize +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 4f32c4062fb6..c0bf039e1b40 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ ---help--- ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. ++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 79226ca8f80f..2a30060e7e1d 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -47,7 +47,11 @@ struct blk_queue_stats; + struct blk_stat_callback; + + #define BLKDEV_MIN_RQ 4 ++#ifdef CONFIG_ZENIFY ++#define BLKDEV_MAX_RQ 512 ++#else + #define BLKDEV_MAX_RQ 128 /* Default maximum */ ++#endif + + /* Must be consistent with blk_mq_poll_stats_bkt() */ + #define BLK_MQ_POLL_STATS_BKTS 16 +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). 
+ */ + #define MIN_NR_CONSOLES 1 /* must be at least 1 */ +-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ ++/* ++ * NR_TTY_DEVICES: ++ * Value MUST be at least 12 and must never be higher then 63 ++ */ ++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ + /* Note: the ioctl VT_GETSTATE does not work for + consoles 16 and higher (since it returns a short) */ + +diff --git a/init/Kconfig b/init/Kconfig +index 041f3a022122..5ed70eb1ad3a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. ++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f0a0be4d344..bada807c7e59 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + #ifdef CONFIG_SMP + /* +@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + /* + * The margin used when comparing utilization with CPU capacity: +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 337c6afb3345..9315e358f292 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 80dad301361d..42b7fa7d01f8 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -702,6 +702,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO + +From: Nick Desaulniers +Date: Mon, 24 Dec 2018 13:37:41 +0200 +Subject: include/linux/compiler*.h: define asm_volatile_goto + +asm_volatile_goto should also be defined for other compilers that +support asm goto. + +Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h +mutually exclusive"). + +Signed-off-by: Nick Desaulniers +Signed-off-by: Miguel Ojeda + +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +index ba814f1..e77eeb0 100644 +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -188,6 +188,10 @@ struct ftrace_likely_data { + #define asm_volatile_goto(x...) asm goto(x) + #endif + ++#ifndef asm_volatile_goto ++#define asm_volatile_goto(x...) asm goto(x) ++#endif ++ + /* Are two types/vars the same type (ignoring qualifiers)? 
*/ + #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) + +From: Andy Lavr +Date: Mon, 24 Dec 2018 14:57:47 +0200 +Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: +https://lwn.net/Articles/711248/ + +Signed-off-by: Andy Lavr + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index e84a10b..21d62b7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 076ba7308e65..81f89095aa77 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index f3ea78b0c91c..4dbacc6b073b 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -621,7 +621,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 14:32:50 -0300 +Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead + pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index a2adf95b3f9c..e804d9f7583a 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); + void task_dirty_inc(struct task_struct *tsk); + + /* readahead.c */ +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read); +From de7119e3db9fdb4c704355854a02a7e9fad931d4 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: [PATCH] ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e03..0b613370d28d8 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1765,6 +1765,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index d2fa3e9ccd97c..bd10cb02fc0ff 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -2826,6 +2826,8 @@ static int __init intel_pstate_setup(char *str) + pr_info("HWP disabled\n"); + no_hwp = 1; + } ++ if (!strcmp(str, "enable")) ++ no_load = 0; + if (!strcmp(str, "force")) + force_load = 1; + if (!strcmp(str, "hwp_only")) diff --git a/linux56-rc-tkg/linux56-tkg-patches/0003-glitched-cfs.patch b/linux56-rc-tkg/linux56-tkg-patches/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux56-rc-tkg/linux56-tkg-patches/0004-5.6-ck1.patch b/linux56-rc-tkg/linux56-tkg-patches/0004-5.6-ck1.patch new file mode 100644 index 0000000..3cff90e --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0004-5.6-ck1.patch @@ -0,0 +1,12561 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e0..e219f958f055 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4307,6 +4307,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index def074807cee..7f00e3a1276b 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -46,6 +46,7 @@ show up in /proc/sys/kernel: + - hung_task_check_interval_secs + - hung_task_warnings + - hyperv_record_panic_msg ++- iso_cpu + - kexec_load_disabled + - kptr_restrict + - l2cr [ PPC only ] +@@ -82,6 +83,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +107,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -438,6 +441,16 @@ When kptr_restrict is set to (2), kernel pointers printed using + %pK will be replaced with 0's regardless of privileges. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + l2cr: (PPC only) + ================ + +@@ -907,6 +920,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? + + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + rtsig-max & rtsig-nr: + ===================== + +@@ -1173,3 +1200,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. 
++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. 
Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. 
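To make the deadline arithmetic above concrete, here is a small standalone sketch; the 10% compounding per nice level follows the description in the text, while the real scheduler keeps these ratios in a precomputed fixed-point table, so the constants and normalisation here are illustrative assumptions:

    #include <stdio.h>

    /* Compound the priority ratio by 10% per nice level above -20,
     * normalised so that nice -20 has ratio 1.0 (scaled by 100 to stay
     * in integer arithmetic). */
    static unsigned long prio_ratio(int nice)
    {
        unsigned long ratio = 100;
        int level;

        for (level = -20; level < nice; level++)
            ratio = ratio * 11 / 10;
        return ratio;
    }

    int main(void)
    {
        const unsigned long rr_interval = 6;        /* default quota, ms */
        const unsigned long jiffies_now = 1000000;  /* pretend current time */
        int nice;

        for (nice = -20; nice <= 19; nice += 13)
            printf("nice %3d -> deadline = %lu (now + %lu)\n", nice,
                   jiffies_now + prio_ratio(nice) * rr_interval / 100,
                   prio_ratio(nice) * rr_interval / 100);
        return 0;
    }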
There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. 
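The SCHED_NORMAL half of that lookup can be sketched as a plain earliest-deadline scan with the expired-deadline early exit described above; this toy deliberately ignores the priority bitmap, CPU affinity and the realtime/ISO FIFO queues:

    #include <stdio.h>

    struct toy_task {
        const char   *name;
        unsigned long deadline;   /* virtual deadline, in jiffies */
    };

    /* O(n) scan of the queued-but-not-running tasks: an already expired
     * deadline is taken immediately, otherwise the earliest deadline wins. */
    static struct toy_task *pick_next(struct toy_task *q, int n,
                                      unsigned long now)
    {
        struct toy_task *best = NULL;
        int i;

        for (i = 0; i < n; i++) {
            if (q[i].deadline <= now)
                return &q[i];
            if (!best || q[i].deadline < best->deadline)
                best = &q[i];
        }
        return best;
    }

    int main(void)
    {
        struct toy_task q[] = {
            { "editor", 1012 }, { "encoder", 1093 }, { "shell", 1030 },
        };

        printf("next task: %s\n", pick_next(q, 3, 1000)->name);
        return 0;
    }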
BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. 
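A compact sketch of the "relative cache distance" weighting described a few paragraphs above (deadline unchanged for cache-local CPUs, doubled for cache-distant ones, quadrupled across NUMA nodes); the enum and helper are illustrative, not the patch's actual encoding:

    #include <stdio.h>

    enum toy_locality { CACHE_LOCAL, CACHE_DISTANT, NUMA_DISTANT };

    /* Weight a remote task's virtual deadline by how far away its last CPU
     * is, so cache-cold migrations only win when the deadline gap is large. */
    static unsigned long scaled_deadline(unsigned long deadline,
                                         enum toy_locality loc)
    {
        switch (loc) {
        case CACHE_DISTANT: return deadline * 2;
        case NUMA_DISTANT:  return deadline * 4;
        default:            return deadline;   /* CACHE_LOCAL */
        }
    }

    int main(void)
    {
        printf("local %lu, distant %lu, numa %lu\n",
               scaled_deadline(100, CACHE_LOCAL),
               scaled_deadline(100, CACHE_DISTANT),
               scaled_deadline(100, NUMA_DISTANT));
        return 0;
    }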
++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. 
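The schedtool invocation above can be approximated with a few lines of C around sched_setscheduler(). Note that glibc headers do not define SCHED_ISO; the value 4 used here is the policy slot mainline reserves for it and the one the BFS/MuQSS patches are understood to use, so treat it as an assumption to verify against the patched kernel's uapi headers:

    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef SCHED_ISO
    #define SCHED_ISO 4   /* assumed MuQSS/BFS value; not in glibc headers */
    #endif

    /* Rough equivalent of "schedtool -I -e <command>": switch the calling
     * process to SCHED_ISO, then exec the target program. */
    int main(int argc, char **argv)
    {
        struct sched_param sp = { .sched_priority = 0 };

        if (argc < 2) {
            fprintf(stderr, "usage: %s <command> [args...]\n", argv[0]);
            return 1;
        }
        if (sched_setscheduler(0, SCHED_ISO, &sp))
            perror("sched_setscheduler(SCHED_ISO)");  /* non-MuQSS kernel? */
        execvp(argv[1], &argv[1]);
        perror("execvp");
        return 1;
    }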
How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. 
BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued processes and led to ++cache thrashing while iterating over the linked list.
++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). 
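A structural sketch of the fixed eight-level skip list node described above; the type and field names are illustrative rather than the patch's actual skip list API, and insertion/removal are omitted since the point is the static level arrays and the O(1) peek at the front:

    #include <stdio.h>

    #define SKIPLIST_LEVELS 8   /* 8 levels: each pointer array is one 64-byte cacheline */

    struct toy_skiplist_node {
        unsigned long long        sortkey;   /* static prio, then deadline */
        void                     *task;      /* payload */
        struct toy_skiplist_node *next[SKIPLIST_LEVELS];
        struct toy_skiplist_node *prev[SKIPLIST_LEVELS];
    };

    struct toy_skiplist {
        struct toy_skiplist_node head;       /* sentinel, never dequeued */
    };

    /* The scheduler only ever wants the best entry, so lookup is O(1):
     * it is always the sentinel's level-0 successor. */
    static struct toy_skiplist_node *toy_skiplist_first(struct toy_skiplist *l)
    {
        return l->head.next[0];              /* NULL when the queue is empty */
    }

    int main(void)
    {
        struct toy_skiplist rq = { .head = { .sortkey = 0 } };
        struct toy_skiplist_node t = { .sortkey = 42 };
        int i;

        for (i = 0; i < SKIPLIST_LEVELS; i++) {  /* hand-link a single node */
            rq.head.next[i] = &t;
            t.prev[i] = &rq.head;
            t.next[i] = NULL;
        }
        printf("first sortkey: %llu\n", toy_skiplist_first(&rq)->sortkey);
        return 0;
    }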
Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examine lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. 
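The "examine locklessly, then trylock" step can be illustrated with an ordinary pthreads analogue; the kernel of course uses its own runqueue locks and raw reads rather than pthreads or C11 atomics, so this is purely a sketch of the pattern:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_rq {
        pthread_mutex_t       lock;
        _Atomic unsigned long best_deadline;   /* published by its owner CPU */
    };

    /* Peek the remote queue without its lock; only if that (possibly stale)
     * value looks better do we pay for a trylock, and we never block on it. */
    static bool worth_stealing(struct toy_rq *remote, unsigned long mine)
    {
        bool better;

        if (atomic_load_explicit(&remote->best_deadline,
                                 memory_order_relaxed) >= mine)
            return false;                      /* nothing better there */

        if (pthread_mutex_trylock(&remote->lock))
            return false;                      /* busy: skip, do not wait */

        better = atomic_load_explicit(&remote->best_deadline,
                                      memory_order_relaxed) < mine;
        /* a real scheduler would dequeue the remote task here */
        pthread_mutex_unlock(&remote->lock);
        return better;
    }

    int main(void)
    {
        struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER,
                             .best_deadline = 950 };

        printf("steal? %s\n", worth_stealing(&rq, 1000) ? "yes" : "no");
        return 0;
    }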
++ ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regardless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and NUMA aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it.
Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, ++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO ++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS ++does _not_ now feature is support for CGROUPS. The average user should neither ++need to know what these are, nor should they need to be using them to have good ++desktop behaviour. However since some applications refuse to work without ++cgroups, one can enable them with MuQSS as a stub and the filesystem will be ++created which will allow the applications to work. ++ ++rr_interval: ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. It is based on the fact that humans can detect ++jitter at approximately 7ms, so aiming for much lower latencies is pointless ++under most circumstances. It is worth noting this fact when comparing the ++latency performance of MuQSS to other schedulers. Worst case latencies being ++higher than 7ms are far worse than average latencies not being in the ++microsecond range. ++ ++interactive: ++ ++ /proc/sys/kernel/interactive ++ ++The value is a simple boolean of 1 for on and 0 for off and is set to on by ++default. Disabling this will disable the near-determinism of MuQSS when ++selecting the next task by not examining all CPUs for the earliest deadline ++task, or which CPU to wake to, instead prioritising CPU balancing for improved ++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis ++instead of across the whole system. ++ ++Runqueue sharing. ++ ++By default MuQSS chooses to share runqueue resources (specifically the skip ++list and locking) between multicore siblings. It is configurable at build time ++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing ++only between simultaneous multithreading siblings, multicore siblings, or ++symmetric multiprocessing physical packages. Additionally it can be set at ++boot time with the use of the rqshare parameter. The reason for configurability ++is that some architectures have CPUs with many multicore siblings (>= 16) ++where it may be detrimental to throughput to share runqueues and another ++sharing option may be desirable. Additionally, more sharing than usual can ++improve latency on a system-wide level at the expense of throughput if desired. ++ ++The options are: ++none, smt, mc, smp ++ ++eg: ++ rqshare=mc ++ ++Isochronous scheduling: ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate).
However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks. To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. 
While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/Makefile b/Makefile +index 6a01b073915e..dd1876725cef 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus + PHONY := _all + _all: + ++CKVERSION = -ck1 ++CKNAME = MuQSS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. + # +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index ef179033a7c2..14b576a531ad 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -665,6 +665,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index a12656ec0072..b46b6ddc7636 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 96dab76da3b3..78611fa96178 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1237,6 +1237,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 519ff58e67b3..b2a05b6f7d80 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 3608e55eaecd..614a0a30cc70 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 1d923dbb9928..9c1931f1fafd 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 2773899c21b3..870866aaa39d 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT_VOLUNTARY=n + CONFIG_TASKSTATS=y + CONFIG_TASK_DELAY_ACCT=y + CONFIG_TASK_XACCT=y +@@ -27,6 +27,11 @@ CONFIG_MODVERSIONS=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_MODVERSIONS=y + CONFIG_BLK_DEV_INTEGRITY=y ++# CONFIG_ARCH_MULTI_V7 is not set ++CONFIG_ARCH_MXS=y ++# CONFIG_ARM_THUMB is not set ++CONFIG_PREEMPT=y ++CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y + CONFIG_UNIX=y +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index e688dfad0b72..000a2e0d01ea 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -911,6 +911,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 1788ae23bff9..8daf4e1eebcd 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 9085f4d6c698..fb23111d45f6 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 21a1168ae301..529a1b1007cf 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index 0921ef38e9fb..6da05cef46f8 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,5 +1,5 @@ + CONFIG_SYSVIPC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 328d4dfeb4cb..e17cb23173ea 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index 914af125a7fa..76a64290373f 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 4ecb157e56d4..ea7309283b01 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -1,10 +1,10 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASKSTATS=y +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 63fe2da1b37f..7f08ee237345 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git 
a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index 24e07180c57d..38582e8f71c4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=m + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=18 +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index 738ba3b1374b..6a3267e8aa0d 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_EXPERT=y + CONFIG_SLAB=y +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 2c7adea7638f..1c82d62bee72 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index e2a412113359..a8fa2e3c05a6 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -878,6 +878,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 7e28919041cf..b22d0a7c2751 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 9a527f978106..5895f2cc726e 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 3b0e1eb6e874..e296a2cd9903 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index 4ec961ace688..a03a1ad670a0 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 6c325d53a20a..98d4ef3d76cf 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 5e8949953660..4f6afc56ff5f 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1004,6 +1004,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1034,6 +1050,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +@@ -1421,7 +1439,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1441,17 +1459,17 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 59ce9ed58430..f19741b0f43d 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -29,7 +29,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 0b9654c7a05c..eb361bdf6026 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -27,7 +27,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_SMP=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 4c297f69171d..5bc4f1be2617 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index cad9563f8f48..d7f93c85c2cb 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3537,7 +3537,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 22c6a2e61236..c4bccd444cbf 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1289,7 +1289,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. 
*/ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->watch_timer); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index e5252ef3812f..6ae6241185ea 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -237,7 +237,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index 75f3efee21a4..09b1932ce85b 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 267eac00a3fb..352af68c6cd7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index d8c40a83097d..8332baf4961c 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? 
+- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 39530d43590e..a7caf2eb5771 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index cf7cfda94107..f63e17489547 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index 856e7ab7f33e..766a26251337 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 137853944e46..76830892f373 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c 
b/drivers/media/pci/ivtv/ivtv-streams.c +index f7de9118f609..f39ad2952c0f 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index cb0437b4c331..163fffc0e1d4 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index fb9de7bbcd19..e53cf45e7f3f 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index b0303cf00387..0925b5065147 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index b690796d24d4..448b13da62b4 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 8e6607fc8a67..b9ab770bbdb5 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index bbb2575d4728..637757144221 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index d2539c95adb6..0c2f31a03ce9 100644 +--- 
a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index c2a58f05b9a1..6ef94ef9b18e 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2678,7 +2678,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index 9ce6d30576dd..e53d42786ac9 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index c4c83ab60cbc..2be961a750b8 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... */ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 90fb73575495..c94048b048a5 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. 
*/ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index bffe548187ee..c2918ee3e100 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -798,7 +798,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -974,7 +974,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. + */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 2018614f258f..fc19b312c345 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index b60795893994..d2d05691dbd2 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], 
flags); + +@@ -2277,7 +2277,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index b138d9fee675..63cacc21aa58 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5181,7 +5181,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index b3650c989ed4..7ed1fb285754 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index f98e3ae27bff..0741c8352a6d 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4742,7 +4742,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index cb95ad6fa4f9..d352058e0336 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +index c94328a5bd4a..6e7d4671aa69 100644 +--- a/drivers/staging/speakup/speakup_acntpc.c ++++ b/drivers/staging/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- 
a/drivers/staging/speakup/speakup_apollo.c ++++ b/drivers/staging/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +index ddbb7e97d118..f9502addc765 100644 +--- a/drivers/staging/speakup/speakup_decext.c ++++ b/drivers/staging/speakup/speakup_decext.c +@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 798c42dfa16c..d85b41db67a3 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index dccb4ea29d37..8ecead307d04 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 414827e888fc..cb31c9176daa 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + 
set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 3568bfb89912..0a80b3b098b2 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 1d1440d43002..52fe89ae1d9d 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index cfe63932f825..71c00ef772a3 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -913,7 +913,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c 
b/drivers/video/fbdev/pxafb.c +index f70c9f79622e..0b363eaee24f 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 37345fb6191d..3874c17d1bc5 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -91,7 +91,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..73417df5daa2 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 716ad1d8d95e..f5fe682bed8a 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -214,13 +217,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long 
schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + asmlinkage void preempt_schedule_irq(void); +@@ -649,9 +679,11 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -676,10 +708,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -844,6 +891,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1298,6 +1349,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct 
task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 1abe91ff6e4a..20ba383562b0 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index f1879884238e..2326bea2d6e7 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -102,7 +102,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h 
b/include/uapi/linux/sched.h +index 4a0217832464..2a7f7f3695f7 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -104,9 +104,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index a34064a031a5..dad51df6237f 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,18 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -786,6 +798,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -885,9 +898,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1016,6 +1033,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
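For the uapi hunk above: on a MuQSS kernel the previously reserved policy number 4 becomes a real policy, SCHED_ISO. The sketch below shows how a userspace program could request it; the constant is defined locally because libc headers do not carry it, and the failure mode on kernels without this patch (typically EINVAL, since policy 4 stays reserved there) is an assumption rather than something this diff states.

    #include <sched.h>
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #ifndef SCHED_ISO
    #define SCHED_ISO 4   /* value added by this patch for MuQSS kernels */
    #endif

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 0 };  /* ISO carries no RT priority */

        /* Ask for SCHED_ISO on the calling process. */
        if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
            fprintf(stderr, "SCHED_ISO rejected: %s\n", strerror(errno));
            return 1;
        }
        printf("running with policy %d (SCHED_ISO)\n", sched_getscheduler(0));
        return 0;
    }

Per the sysctl defaults added later in this patch, such a task may use roughly sched_iso_cpu percent of CPU (70 by default) as a soft-realtime task over a rolling five-second period before MuQSS throttles it back to normal scheduling.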
+@@ -1134,6 +1152,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..5c2bcbf25add 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +86,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +94,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index da1bc0b60a7d..2b323af46d7c 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1125,6 +1125,8 @@ static int __ref kernel_init(void *unused) + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. 
++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. ++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..89ed751ac4e4 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -20,11 +21,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -32,7 +40,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -40,7 +51,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. 
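The HZ_100 help text above is the flip side of the schedule_msec_hrtimeout*() conversions earlier in this patch: once short sleeps are hrtimer-backed, a 100 Hz tick no longer degrades them. A rough sketch of the call-site pattern being converted, using a hypothetical driver poll loop (struct my_dev, device_ready() and the error values are made up for illustration; only the two sleep calls reflect the patch):

    #include <linux/sched.h>
    #include <linux/jiffies.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    struct my_dev;                                            /* hypothetical device */
    static bool device_ready(struct my_dev *dev) { return true; /* stub */ }

    static int wait_for_ready(struct my_dev *dev)
    {
        int tries = 100;

        while (!device_ready(dev) && --tries) {
            /* old form: msecs_to_jiffies(5) is a single jiffy at HZ=100,
             * so the sleep is quantised to 10 ms steps */
            /* schedule_timeout_interruptible(msecs_to_jiffies(5)); */

            /* new form from this patch: hrtimer-backed, ~5 ms at any HZ */
            if (schedule_msec_hrtimeout_interruptible(5))
                return -ERESTARTSYS;   /* woken early, e.g. by a signal */
        }
        return tries ? 0 : -ETIMEDOUT;
    }

The nonzero-return-means-interrupted handling mirrors how the converted call sites in radio-mr800.c and tea575x.c treat the return value.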
+ +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -51,9 +62,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index bf82259cff96..d9438eb6f91c 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -2,7 +2,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -18,7 +18,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +diff --git a/kernel/Makefile b/kernel/Makefile +index f2cc0d118a0b..7c18c3eddd9b 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o ++ async.o range.o smpboot.o ucount.o skip_list.o + + obj-$(CONFIG_MODULES) += kmod.o + obj-$(CONFIG_MULTIUSER) += groups.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 2833ffb0c211..37a1f8d73eee 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index f92d9a687372..d17db0ff775f 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -111,6 +111,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and 
requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. ++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 1753486b440c..f43423737493 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -24,9 +24,20 @@ + #include "internals.h" + + #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index b262f47046ca..9797ad652268 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -433,6 +433,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -454,7 +482,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..825f9b8e228f 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. 
+ */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..a04ffebc6b7a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..b8b35546c416 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7607 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.198 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. 
++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
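/*
 * Illustrative sketch only, not part of the MuQSS patch: reads the
 * tunables described above through procfs. The sysctl names used here
 * (rr_interval, interactive, iso_cpu, yield_type) are assumed to be the
 * usual MuQSS ones; adjust them if your build exposes different names.
 */
#include <stdio.h>

static void show(const char *name)
{
	char path[128], buf[32];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (!f) {
		printf("%-12s <not available>\n", name);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-12s %s", name, buf);
	fclose(f);
}

int main(void)
{
	show("rr_interval");	/* round robin interval in ms */
	show("interactive");	/* latency vs throughput toggle */
	show("iso_cpu");	/* SCHED_ISO cpu percentage */
	show("yield_type");	/* sched_yield behaviour */
	return 0;
}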
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. 
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. 
++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. 
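/*
 * Illustrative sketch only, not part of the MuQSS patch: works through
 * the deadline offset formula above (prio_deadline_diff) for a few nice
 * levels with the default rr_interval of 6ms. The prio_ratios
 * initialisation used here (128 at nice -20, each step ~10% longer) and
 * the 40-entry nice width are assumptions matching the usual MuQSS/BFS
 * scheme; they are not copied from this excerpt.
 */
#include <stdio.h>

#define NICE_LEVELS	40			/* assumed NICE_WIDTH */
#define RR_INTERVAL_MS	6			/* default rr_interval above */
#define MS_TO_NS(t)	((t) << 20)		/* same approximation as the patch */

int main(void)
{
	unsigned long long ratios[NICE_LEVELS];
	int nice_vals[] = { -20, 0, 19 };
	int i;

	ratios[0] = 128;				/* assumed base ratio */
	for (i = 1; i < NICE_LEVELS; i++)
		ratios[i] = ratios[i - 1] * 11 / 10;	/* ~10% longer per nice level */

	for (i = 0; i < 3; i++) {
		int idx = nice_vals[i] + 20;		/* user priority index */
		unsigned long long off_ns =
			ratios[idx] * RR_INTERVAL_MS * (MS_TO_NS(1) / 128);

		printf("nice %3d: deadline offset ~%llu ns (~%.1f ms)\n",
		       nice_vals[i], off_ns, off_ns / 1e6);
	}
	return 0;
}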
++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = NS_TO_US(delta) / us_interval; ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. 
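/*
 * Illustrative sketch only, not part of the MuQSS patch: checks the
 * constants in the rolling-average comment above. A single update
 * spanning 32768us (~32ms) decays the old average by 80/128 (~63%), and
 * the cheaper "* 5 / 262144" form used in the code is exactly the same
 * per-microsecond fraction as "* 80 / 32768 / 128".
 */
#include <stdio.h>

int main(void)
{
	const double per_us_a = 80.0 / 32768.0 / 128.0;	/* as written in the comment */
	const double per_us_b = 5.0 / 262144.0;		/* as used in the code */
	const double us_interval = 32768.0;		/* ~32ms expressed in us */

	printf("per-us factor: %.9f vs %.9f\n", per_us_a, per_us_b);
	printf("decay over 32768us in one update: %.3f (target 80/128 = %.3f)\n",
	       us_interval * per_us_b, 80.0 / 128.0);
	return 0;
}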
++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. 
*/ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. 
This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. 
++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. 
++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. 
++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++#endif ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. 
++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. 
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. 
Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. 
++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
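The release store of on_cpu below pairs with the acquire load spin in try_to_wake_up() earlier in this file. A minimal user-space analogue with C11 atomics (purely illustrative, the names are stand-ins) shows the guarantee being relied on: whoever observes on_cpu == 0 with acquire semantics is also guaranteed to see everything published before the release store.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int task_state;          /* stands in for prev->state  */
static atomic_int on_cpu = 1;   /* stands in for prev->on_cpu */

static void *switch_out(void *arg)
{
        task_state = 42;        /* publish the final state */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
}

static void *waker(void *arg)
{
        /* analogue of smp_cond_load_acquire(&p->on_cpu, !VAL) */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
                ;
        printf("state seen after acquire: %d\n", task_state); /* 42 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, switch_out, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}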
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ ++ do_pending_softirq(rq, current); ++ ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). 
If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). 
++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. 
++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. 
++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
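The fixed-point exponentiation and geometric-series identity above are easy to sanity-check outside the kernel. The sketch below re-does the 1-minute load-average update in user space with the usual constants (FSHIFT = 11, EXP_1 = 1884), comparing twelve single-period updates against one catch-up step in the style of calc_load_n(); the constants and the rounding in calc_load() are reproduced here for illustration only.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884   /* 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1;
        return newload / FIXED_1;
}

/* x^n in fixed point, O(log n) -- same shape as fixed_power_int() above */
static unsigned long fpow(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1)
                        result = (result * x + (1UL << (frac_bits - 1))) >> frac_bits;
                x = (x * x + (1UL << (frac_bits - 1))) >> frac_bits;
                n >>= 1;
        }
        return result;
}

int main(void)
{
        /* one runnable task (active = FIXED_1), starting from load 0 */
        unsigned long active = FIXED_1, step = 0, skip;
        int i;

        for (i = 0; i < 12; i++)                /* 12 x 5s = 1 minute */
                step = calc_load(step, EXP_1, active);

        /* catching up 12 missed periods at once, as calc_load_n() does */
        skip = calc_load(0, fpow(EXP_1, FSHIFT, 12), active);

        printf("loadavg after 1 min: %.2f (iterated) vs %.2f (calc_load_n)\n",
               step / (double)FIXED_1, skip / (double)FIXED_1);
        return 0;
}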
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ * ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. 
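The "115/128 is ~90/100" shortcut used for the ISO hysteresis below is just a multiply plus a shift in place of a division by 100. A tiny user-space check (the 70% default for sched_iso_cpu is assumed here, matching the usual MuQSS default rather than anything in this hunk):

#include <stdio.h>

int main(void)
{
        int sched_iso_cpu = 70;                 /* assumed default ISO cpu % */
        int exact = sched_iso_cpu * 90 / 100;   /* 63 */
        int approx = sched_iso_cpu * 115 / 128; /* 62 -- close enough, no /100 */

        printf("90%% of %d: exact %d, 115/128 approximation %d (ratio %.4f)\n",
               sched_iso_cpu, exact, approx, 115.0 / 128.0);
        return 0;
}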
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ curr = rq->curr; ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. 
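To make the atomic_fetch_add_unless() step in the state diagram above concrete, here is a user-space model of the three transitions using C11 atomics; fetch_add_unless is re-implemented with a CAS loop since C11 has no direct equivalent. This is an illustration of the state machine only, not kernel code.

#include <stdatomic.h>
#include <stdio.h>

enum { OFFLINE = 0, OFFLINING = 1, RUNNING = 2 };

/* C11 analogue of atomic_fetch_add_unless(v, a, u): add a unless *v == u */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
        int old = atomic_load(v);

        while (old != u && !atomic_compare_exchange_weak(v, &old, old + a))
                ;
        return old;
}

int main(void)
{
        atomic_int state = OFFLINE;

        /* sched_tick_start(): OFFLINE -> RUNNING */
        printf("start: was %d\n", atomic_exchange(&state, RUNNING));

        /* sched_tick_stop(): RUNNING -> OFFLINING */
        printf("stop: was %d\n", atomic_exchange(&state, OFFLINING));

        /* sched_tick_remote(): decrement unless still RUNNING, so OFFLINING
         * drops to OFFLINE and the delayed work is not requeued */
        printf("remote tick: was %d, now %d\n",
               fetch_add_unless(&state, -1, RUNNING), atomic_load(&state));
        return 0;
}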
*/ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. 
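For a feel of how the virtual deadline assigned in time_slice_expired() spreads tasks by nice level, the sketch below reproduces the BFS/MuQSS-style priority ratios, where each nice step is roughly 10% "later". The 128 base, 11/10 step and 6 ms rr_interval are assumptions matching common defaults, not values taken from this hunk, so treat the printed offsets as illustrative only.

#include <stdio.h>

#define NICE_WIDTH     40
#define RR_INTERVAL_MS 6

int main(void)
{
        unsigned long prio_ratios[NICE_WIDTH];
        int i;

        prio_ratios[0] = 128;                           /* nice -20 */
        for (i = 1; i < NICE_WIDTH; i++)
                prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

        for (i = 0; i < NICE_WIDTH; i += 10) {
                double offset_ms = (double)prio_ratios[i] * RR_INTERVAL_MS / 128.0;

                printf("nice %+3d: deadline offset ~%.1f ms after rq->niffies\n",
                       i - 20, offset_ms);
        }
        return 0;
}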
++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. ++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. 
++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. 
++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ do_pending_softirq(rq, next); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. 
++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). 
We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq, NULL); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
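++ *
++ * Illustrative userspace sketch (not part of the original code; assumes a
++ * glibc environment): nice(2) adds the increment to the caller's nice
++ * value, which the code below clamps to MIN_NICE..MAX_NICE (-20..19).
++ * A negative increment (raising priority) requires CAP_SYS_NICE or a
++ * sufficient RLIMIT_NICE, as checked by can_nice() above.
++ *
++ *	#include <unistd.h>
++ *	#include <errno.h>
++ *	#include <stdio.h>
++ *
++ *	errno = 0;
++ *	int newnice = nice(5);		// make the caller 5 steps "nicer"
++ *	if (newnice == -1 && errno)	// -1 is a legal return value, so check errno
++ *		perror("nice");
++ *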
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
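++ *
++ * A portable way for userspace to discover this range is the pair of
++ * sched_get_priority_min()/sched_get_priority_max() syscalls implemented
++ * further down in this file. Illustrative sketch (not part of the
++ * original code):
++ *
++ *	#include <sched.h>
++ *	#include <stdio.h>
++ *
++ *	int lo = sched_get_priority_min(SCHED_FIFO);	// 1
++ *	int hi = sched_get_priority_max(SCHED_FIFO);	// MAX_USER_RT_PRIO-1
++ *	printf("SCHED_FIFO priority range: %d..%d\n", lo, hi);
++ *
++ * The non-RT policies, including MuQSS' SCHED_ISO and SCHED_IDLEPRIO,
++ * report 0 for both bounds.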
++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
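++ *
++ * Illustrative userspace counterpart (a sketch, not part of the original
++ * code; assumes the glibc wrapper, with minimal error handling):
++ *
++ *	#include <sched.h>
++ *	#include <stdio.h>
++ *
++ *	struct sched_param sp = { .sched_priority = 10 };
++ *
++ *	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)	// 0 == calling thread
++ *		perror("sched_setscheduler");
++ *
++ * Note the MuQSS-specific twist visible in __sched_setscheduler() above:
++ * without CAP_SYS_NICE and with RLIMIT_RTPRIO at 0, such an RT request is
++ * silently downgraded to SCHED_ISO instead of failing.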
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. 
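++ *
++ * From userspace this versioning is what lets a binary built against an
++ * older or newer struct sched_attr keep working. Illustrative sketch (not
++ * part of the original code; glibc has historically not wrapped these
++ * calls, so a raw syscall is shown, and header availability may vary):
++ *
++ *	#include <linux/sched/types.h>	// struct sched_attr (uapi), if present
++ *	#include <sys/syscall.h>
++ *	#include <unistd.h>
++ *
++ *	struct sched_attr attr = { 0 };
++ *	long ret = syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0);
++ *	// on success, attr.size reports how much of the struct the kernel knows about
++ *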
++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. 
Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
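++ *
++ * Illustrative userspace sketch (not part of the original code):
++ *
++ *	#include <sched.h>
++ *
++ *	sched_yield();		// always reports success
++ *
++ * MuQSS additionally gates the behaviour on sched_yield_type (see
++ * do_sched_yield() below): 0 makes yield a no-op, 1 schedules away, and
++ * values above 1 also expire the caller's remaining timeslice first.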
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. 
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct sched_domain *sd; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { ++ cpu = i; ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. 
*/ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ struct cpumask old_mask; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, p->cpus_ptr); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. 
++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header 
*sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++ return per_cpu(cpu_llc_shared_map, cpu); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct rq *rq, *other_rq, *leader = cpu_rq(0); ++ struct sched_domain *sd; ++ int cpu, other_cpu, i; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for (cpu = 0; cpu < num_online_cpus(); cpu++) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ 
local_irq_enable(); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (rq->mc_leader == rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (rq->smt_leader == rq) && ++#endif ++ (rq->smp_leader == rq)) { ++ total_runqueues++; ++ } ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int selected_cpus[NR_CPUS], selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus[NR_CPUS], ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (other_rq->mc_leader == other_rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (other_rq->smt_leader == other_rq) && ++#endif ++ (other_rq->smp_leader == other_rq)) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ 
total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->smp_leader = rq; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = rq; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = rq; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. 
++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR 
++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..cf655482df00 +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1012 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. 
++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. 
Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * 
++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. 
++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_rq_util_with(struct rq __maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) ++{ ++ return util; ++} ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. ++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 9b8916fd00a2..5769f533f6d2 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index 7dc20a3232e7..e733a0a53b0a 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,11 +17,13 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); + void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index d43318a489f2..16b0866ac3b6 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. 
+@@ -661,7 +641,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index ffa959e91227..ad4d17a51c0b 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -231,6 +231,8 @@ static void cpuidle_idle_call(void) + static void do_idle(void) + { + int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -241,7 +243,10 @@ static void do_idle(void) + */ + + __current_set_polling(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); +@@ -279,7 +284,8 @@ static void do_idle(void) + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +@@ -361,6 +367,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. + */ +@@ -481,3 +488,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 280a3c735935..70e7f77f7691 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2487,3 +2500,30 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++/* MuQSS compatibility functions */ ++static inline bool softirq_pending(int cpu) ++{ ++ return false; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6ec1e595b1d4..7261ad3ec264 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -3,6 +3,7 @@ + * Scheduler topology setup/handling methods + */ + #include "sched.h" ++#include "linux/sched/deadline.h" + + DEFINE_MUTEX(sched_domains_mutex); + +@@ -442,7 +443,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -468,7 +473,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + 
set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 70665934d53e..0f9e94cb12aa 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; +-#ifdef CONFIG_PRINTK ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++static int zero = 0; ++static int one = 1; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -300,7 +310,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +327,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = 
&sysctl_sched_child_runs_first, +@@ -498,6 +509,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1082,62 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index fcc42353f125..46bb16d3c159 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -66,6 +66,9 @@ config NO_HZ_COMMON + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + ++config NO_HZ_FULL ++ bool ++ + choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ +@@ -87,8 +90,9 @@ config NO_HZ_IDLE + + Most of the time you want to say Y here. + +-config NO_HZ_FULL ++config NO_HZ_FULL_NODEF + bool "Full dynticks system (tickless)" ++ select NO_HZ_FULL + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping +@@ -114,6 +118,8 @@ config NO_HZ_FULL + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + ++ Not recommended for desktops,laptops, or mobile devices. ++ + Say N. 
+ + endchoice +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..544c58c29267 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++int __read_mostly hrtimer_granularity_us = 100; ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 8de90ea31280..19523b9640cd 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2209,3 +2209,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) ++ return schedule_timeout(jiffs); ++ ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 
0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 500; ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..0db83bdf7f39 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -845,7 +845,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 4820823515e9..13034cc7c9a4 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1567,7 +1568,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1585,6 +1586,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1654,7 +1658,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +@@ -1889,6 +1893,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. 
++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); +@@ -1897,10 +1913,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +@@ -2042,7 +2058,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -2056,7 +2084,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..6edb01f2fd81 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 572fb17c6273..0e42b5dc689e 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -177,7 +177,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
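The msleep()/msleep_interruptible() changes above follow the comment in the hunk: tick-based timers are too coarse for short sleeps, so requests under five jiffies are rerouted to the hrtimer helpers when high resolution timers are available. The arithmetic behind that is easy to see from userspace; the sketch below is illustration only (a simplified msecs_to_jiffies(), hand-picked HZ values), not kernel code.

/* tick_overshoot.c - show why msleep(1) overshoots on the timer wheel.
 * Mirrors the "msecs_to_jiffies(msecs) + 1" path that the patch bypasses
 * for short sleeps when hrtimers are available. */
#include <stdio.h>

/* Simplified msecs_to_jiffies(): round the request up to whole ticks. */
static unsigned long msecs_to_jiffies(unsigned int msecs, unsigned int hz)
{
	return ((unsigned long)msecs * hz + 999) / 1000;
}

int main(void)
{
	const unsigned int hz_values[] = { 100, 250, 300, 500, 1000 };
	const unsigned int msecs = 1;		/* msleep(1) */
	size_t i;

	for (i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
		unsigned int hz = hz_values[i];
		unsigned long ticks = msecs_to_jiffies(msecs, hz) + 1;

		printf("HZ=%4u: msleep(%u) waits ~%lu ms on the timer wheel\n",
		       hz, msecs, ticks * 1000 / hz);
	}
	return 0;
}

At HZ=100 the 1 ms request becomes roughly 20 ms, at HZ=250 about 8 ms, and only at HZ=1000 does it approach the asked-for value, which is the gap the schedule_msec_hrtimeout() conversions in the surrounding hunks close.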
+diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 294bfcf0ce0e..81d0fce8cdee 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index cc8594d76c70..6752b44dd154 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2002,7 +2002,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2010,7 +2010,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index f70b9f7e68bb..77b65398ca07 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index fe99584c917f..f1344d532a13 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1136,7 +1136,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 271235a69c01..3ec90e1b1eb4 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 6497c1ea6228..08fefeca9d82 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index b6378f025836..5f5e58655d32 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + __printf(3, 4) +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index 9c437c716cfd..4e0a02f07bf5 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, diff --git a/linux56-rc-tkg/linux56-tkg-patches/0004-glitched-muqss.patch b/linux56-rc-tkg/linux56-tkg-patches/0004-glitched-muqss.patch new file mode 100644 index 0000000..2c4837e --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0004-glitched-muqss.patch @@ -0,0 +1,78 @@ 
+From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - MuQSS + +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index 84a1d08d68551..57c3036a68952 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ ++#ifdef CONFIG_ZENIFY ++int sched_iso_cpu __read_mostly = 25; ++#else + int sched_iso_cpu __read_mostly = 70; ++#endif + + /* + * sched_yield_type - Choose what sort of yield sched_yield will perform. + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + choice + prompt "Timer frequency" + default HZ_100 if SCHED_MUQSS +- default HZ_250_NODEF if !SCHED_MUQSS ++ default HZ_500_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -50,6 +50,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500_NODEF ++ bool "500 HZ" ++ help ++ 500 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ ++ config HZ_750_NODEF ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000_NODEF + bool "1000 HZ" + help +@@ -63,6 +70,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250_NODEF + default 300 if HZ_300_NODEF ++ default 500 if HZ_500_NODEF ++ default 750 if HZ_750_NODEF + default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus + + CKVERSION = -ck1 + CKNAME = MuQSS Powered +-EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. 
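The glitched-MuQSS patch above only retunes compile-time defaults (timer frequency, sched_iso_cpu, the version string); the runtime counterparts were exported through /proc/sys/kernel by the sysctl hunk earlier in this series. Below is a minimal sketch for inspecting those knobs from userspace; it assumes a kernel built with CONFIG_SCHED_MUQSS and simply reports the files as absent otherwise.

/* muqss_tunables.c - print the MuQSS sysctl knobs added by this patch set.
 * The entries only exist on a CONFIG_SCHED_MUQSS kernel; elsewhere the
 * files are missing and that is all this program reports. */
#include <stdio.h>

int main(void)
{
	/* Names and ranges as registered in the kern_table hunk above. */
	static const char *knobs[] = {
		"rr_interval",	/* 1 .. 1000 */
		"interactive",	/* 0 or 1 */
		"iso_cpu",	/* 0 .. 100, compare sched_iso_cpu above */
		"yield_type",	/* 0, 1 or 2 */
	};
	size_t i;

	for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		char path[128], buf[64];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/kernel/%s", knobs[i]);
		f = fopen(path, "r");
		if (!f) {
			printf("%-12s: not present (not a MuQSS kernel?)\n",
			       knobs[i]);
			continue;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%-12s: %s", knobs[i], buf);	/* buf keeps its '\n' */
		fclose(f);
	}
	return 0;
}

Writing the same paths as root (for example echoing a new value into /proc/sys/kernel/iso_cpu) changes the behaviour at runtime, whereas the Kconfig/ZENIFY change above only moves the boot-time default.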
diff --git a/linux56-rc-tkg/linux56-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux56-rc-tkg/linux56-tkg-patches/0004-glitched-ondemand-muqss.patch new file mode 100644 index 0000000..02933e4 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0004-glitched-ondemand-muqss.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (45) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (45) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux56-rc-tkg/linux56-tkg-patches/0005-glitched-ondemand-pds.patch b/linux56-rc-tkg/linux56-tkg-patches/0005-glitched-ondemand-pds.patch new file mode 100644 index 0000000..c1929e8 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0005-glitched-ondemand-pds.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux56-rc-tkg/linux56-tkg-patches/0005-glitched-pds.patch b/linux56-rc-tkg/linux56-tkg-patches/0005-glitched-pds.patch new file mode 100644 index 0000000..23271f5 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0005-glitched-pds.patch @@ -0,0 +1,166 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. + +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -715,6 +715,7 @@ menu "Scheduler features" + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_PDS + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) 
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/linux56-rc-tkg/linux56-tkg-patches/0005-v5.6_undead-pds099o.patch b/linux56-rc-tkg/linux56-tkg-patches/0005-v5.6_undead-pds099o.patch new file mode 100644 index 0000000..0e1c215 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0005-v5.6_undead-pds099o.patch @@ -0,0 +1,8369 @@ +From 89067d28ca90681fc6cf108de79b9aedb93dfa9d Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 9 Dec 2019 7:11:23 +0100 +Subject: PDS 099o, 5.5rc1 rebase + + +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..360a229b0abe 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -82,6 +82,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +106,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt +new file mode 100644 +index 000000000000..709e86f6487e +--- /dev/null ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- ++ ++CONTENT ++======== ++ ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. 
Each cpu has its own ++run queue. A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..9d44d8d78259 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1034,6 +1034,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index b66e81c06a57..a294f8f5fd75 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index dced033875bf..d2cd03766b09 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..8268cad4b0a2 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -644,9 +645,13 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -655,6 +660,7 @@ struct task_struct { + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -663,6 +669,7 @@ struct task_struct { + * used CPU that may be idle. 
+ */ + int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ + int wake_cpu; + #endif + int on_rq; +@@ -672,13 +679,27 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_PDS ++ int time_slice; ++ u64 deadline; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* CONFIG_SCHED_PDS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1283,6 +1304,29 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..a5e5fc2c9170 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_PDS ++ ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) ++{ ++ return 1; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_PDS */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..fba04bb91492 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,7 +20,18 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..a96012e6f15e 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_PDS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return 
false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..f186b8119ad6 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..713fedd8034f +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ Copyright (C) 2016 Alfred Chen. ++ ++ Code based on Con Kolivas's skip list implementation for BFS, and ++ which is based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++ ++This file only provides a infrastructure of skip list. ++ ++skiplist_node is embedded into container data structure, to get rid the ++dependency of kmalloc/kfree operation in scheduler code. ++ ++A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++macro and be used for skip list insert operation. ++ ++Random Level is also not defined in this file, instead, it should be customized ++implemented and set to node->level then pass to the customized skiplist_insert ++function. ++ ++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ ++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++considering that there will be 256 entries to enable the top level when using ++random level p=0.5, and that number is more than enough for a run queue usage ++in a scheduler usage. And it also help to reduce the memory usage of the ++embedded skip list node in task_struct to about 50%. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++BFS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
++*/ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty()*/ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. ++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. 
++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 25b4fa00bad1..fc0aabdce15f 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -84,7 +84,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++#ifdef CONFIG_SCHED_PDS ++#define SCHED_ISO 4 ++#endif + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..ee3b9957cf3b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,21 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_PDS ++ bool "PDS-mq cpu scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -802,6 +817,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_PDS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -903,7 +919,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_PDS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1032,6 +1048,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
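The uapi hunk above gives SCHED_ISO a real value (4) in place of the mainline "reserved but not implemented" placeholder. From userspace it is requested like any other policy; the sketch below is a hedged example rather than part of the patch, and on a kernel without PDS/MuQSS the call simply fails with EINVAL.

/* iso_demo.c - ask for the SCHED_ISO policy defined above.
 * Only a BFS/MuQSS/PDS kernel accepts policy 4; mainline rejects it. */
#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* matches the uapi addition above */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
		fprintf(stderr, "SCHED_ISO rejected: %s (kernel without PDS/MuQSS?)\n",
			strerror(errno));
		return 1;
	}
	printf("running with policy %d (SCHED_ISO)\n", sched_getscheduler(0));
	return 0;
}

By design the policy needs no elevated privileges on those kernels: it offers near-realtime latency to ordinary users, capped by the iso_cpu percentage exposed through sysctl.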
+@@ -1150,6 +1167,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_PDS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..89787e2feb60 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -58,6 +58,126 @@ struct task_struct init_task + __init_task_data + #endif + = { ++#ifdef CONFIG_SCHED_PDS ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), ++#endif ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ 
.numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -181,6 +301,7 @@ struct task_struct init_task + #ifdef CONFIG_SECURITY + .security = NULL, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index c87ee6412b36..4045c8532027 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..baa525865d5c 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_PDS ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2874bf556162..fad8a279fdfa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c 
+@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..8ebe4e33fb5f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..07f278dc3137 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
+@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -918,6 +929,7 @@ static int __init sugov_register(void) + fs_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..0a9548ee995c 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -663,7 +672,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..454fa7e460e3 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +new file mode 100644 +index 000000000000..aefbd9cebcfb +--- /dev/null ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6541 @@ ++/* ++ * kernel/sched/pds.c, was kernel/sched.c ++ * ++ * PDS-mq Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ */ ++#include "pds_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. 
++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; ++ ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; ++} ++__setup("rr_interval=", rr_interval_set); ++ ++ ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; ++ ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ ++#endif ++ ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
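[Editor's note: the sched_prio2deadline[] table above is a geometric series: each nice level's deadline offset is roughly 10% larger than the previous one, starting at 6291456 ns for nice -20 and ending near 258860401 ns for nice +19. The stand-alone check below is an illustration added for this write-up, not part of the patch.]

    /* Illustrative sketch (not part of the patch): regenerate the
     * sched_prio2deadline[] values as a 1.1^n geometric series and compare
     * against the first and last entries quoted in the table above.
     * Build with: cc prio2deadline.c -lm */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double base = 6291456.0;    /* nice -20 entry, in nanoseconds */
        int n;

        for (n = 0; n < 40; n++)
            printf("nice %3d: %.0f ns\n", n - 20, base * pow(1.1, n));
        /* n = 39 prints ~258.86 ms worth of ns, matching the nice +19 entry */
        return 0;
    }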
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. 
++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes CPU fairly amongst tasks of the ++ * same nice value, it proportions CPU according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 task_deadline_diff(const struct task_struct *p) ++{ ++ return sched_prio2deadline[TASK_USER_PRIO(p)]; ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return sched_prio2deadline[USER_PRIO(static_prio)]; ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline for non-rt tasks. 
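[Editor's note: the deadline comment above says the CPU proportion works out to the square of the virtual deadline difference, giving nice 19 about 3% of the CPU relative to nice 0. With the ~1.1x per-nice-level growth of sched_prio2deadline[], that figure can be checked numerically; this snippet is an illustration added for this write-up, not part of the patch.]

    /* Illustrative sketch (not part of the patch): sanity-check the
     * "nice 19 gets ~3% CPU compared to nice 0" statement above, using the
     * ~1.1x per-nice-level growth of the deadline offsets.
     * Build with: cc nice19.c -lm */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double offset_ratio = pow(1.1, 19);  /* nice 19 vs nice 0 deadline offset */
        double share = 100.0 / (offset_ratio * offset_ratio);

        printf("deadline offset ratio: %.2f\n", offset_ratio);  /* ~6.12 */
        printf("relative CPU share:    %.1f%%\n", share);       /* ~2.7%, i.e. "about 3%" */
        return 0;
    }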
++ */ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); ++ ++ update_task_priodl(p); ++} ++ ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); ++} ++ ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) ++{ ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; ++ ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
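[Editor's note: the run queue skip list is ordered by the 64-bit priodl key built in update_task_priodl() above: the task's prio occupies the top 8 bits and the deadline, shifted right by 8, fills the rest, so rq_first_queued_task() can return the best runnable task with a single head lookup and the search helper later in this file only compares one u64. The sketch below uses made-up priority and deadline numbers, not the kernel's actual constants, and is not part of the patch.]

    /* Illustrative sketch (not part of the patch): how the priodl key packs
     * priority and deadline so that one u64 compare orders the run queue
     * skip list: lower prio value first, earlier deadline as tie-breaker. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t priodl(uint64_t prio, uint64_t deadline_ns)
    {
        return (prio << 56) | (deadline_ns >> 8);  /* mirrors update_task_priodl() */
    }

    int main(void)
    {
        /* hypothetical values: a low (RT-ish) prio number always sorts first;
         * among equal priorities the earlier deadline wins */
        uint64_t rt    = priodl(10, 5000000);
        uint64_t soon  = priodl(120, 1000000);
        uint64_t later = priodl(120, 2000000);

        printf("rt < soon:    %d\n", rt < soon);     /* 1 */
        printf("soon < later: %d\n", soon < later);  /* 1 */
        return 0;
    }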
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rcu_read_lock(); ++ ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) ++{ ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; ++} ++ ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) ++{ ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; ++} ++ ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) ++{ ++ struct task_struct *p = rq_first_queued_task(rq); ++ ++ if (p->prio != NORMAL_PRIO) ++ return; ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} ++ ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++} ++#endif ++ ++static inline void update_sched_rq_queued_masks(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif ++ ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; ++ ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) ++ return; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == 
per_cpu(sched_sibling_cpu, cpu)) ++ return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif ++} ++ ++static inline void update_sched_rq_pending_masks(struct rq *rq) ++{ ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); ++ ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); ++} ++ ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. ++ * ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
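[Editor's note: dequeue_task() above, together with update_sched_rq_queued_masks() and update_sched_rq_pending_masks(), maintains one cpumask per run-queue "level" plus a bitmap of non-empty levels; the wakeup path later in this file (task_preemptible_rq()) uses them to find a preemptible CPU with a couple of bit scans instead of walking every run queue. The user-space model below is deliberately simplified, ignores CPU affinity and locality, and is an illustration added for this write-up, not part of the patch.]

    /* Illustrative sketch (not part of the patch): a toy model of the
     * per-level cpumask bookkeeping. Each CPU is recorded under the "level"
     * of what it is currently running; a wakeup scans levels from the lowest
     * (empty) upward and stops at the first level below its own, which names
     * a CPU it may preempt. */
    #include <stdio.h>

    #define NR_LEVELS 12   /* SCHED_RQ_EMPTY .. SCHED_RQ_RT in the patch */
    #define NR_CPUS   8

    static unsigned int level_mask[NR_LEVELS];  /* bit n set => cpu n at this level */

    static void set_cpu_level(int cpu, int level)
    {
        int l;

        for (l = 0; l < NR_LEVELS; l++)
            level_mask[l] &= ~(1u << cpu);
        level_mask[level] |= 1u << cpu;
    }

    static int find_preempt_cpu(int preempt_level)
    {
        int l;

        for (l = 0; l < preempt_level; l++)
            if (level_mask[l])
                return __builtin_ctz(level_mask[l]);  /* any CPU in that mask */
        return -1;
    }

    int main(void)
    {
        set_cpu_level(0, 11);  /* cpu0 runs RT work */
        set_cpu_level(1, 2);   /* cpu1 runs a normal task */
        set_cpu_level(2, 0);   /* cpu2 is empty */

        /* a waking normal task (level 2 here) should pick the empty cpu2 */
        printf("target cpu: %d\n", find_preempt_cpu(2));
        return 0;
    }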
++ */ ++static inline bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
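[Editor's note: pds_skiplist_random_level() above relies on the index of the lowest set bit of a pseudo-random word: bit k is the first set bit with probability 2^-(k+1), which gives the p = 0.5 geometric level distribution a skip list needs. The simulation below is an illustration added for this write-up, not part of the patch; it uses rand() instead of the run-queue clock and eight levels as a stand-in for the patch's level cap.]

    /* Illustrative sketch (not part of the patch): the geometric level
     * distribution produced by taking the index of the lowest set bit of a
     * random word, as pds_skiplist_random_level() does with find_first_bit(). */
    #include <stdio.h>
    #include <stdlib.h>

    #define NUM_LEVELS 8
    #define SAMPLES    1000000

    int main(void)
    {
        long hist[NUM_LEVELS] = { 0 };
        int i;

        srand(1);
        for (i = 0; i < SAMPLES; i++) {
            /* force the top illustrative level so the result never exceeds it,
             * mirroring the cap passed to find_first_bit() in the patch */
            unsigned int seed = (unsigned int)rand() | 1u << (NUM_LEVELS - 1);
            hist[__builtin_ctz(seed)]++;   /* lowest set bit == skip list level */
        }
        for (i = 0; i < NUM_LEVELS; i++)   /* expect ~50%, ~25%, ~12.5%, ... */
            printf("level %d: %5.2f%%\n", i, 100.0 * hist[i] / SAMPLES);
        return 0;
    }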
++ */ ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * PDS doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. 
++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ return policy_to_prio[p->policy]; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. 
We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq, p); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). 
*/ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! 
We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
*/ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, &p->cpus_mask) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask; ++ ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; ++ ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++/* ++ * task_preemptible_rq - return the rq which the given task can preempt on ++ * @p: task wants to preempt CPU ++ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) ++{ ++ cpumask_t tmp; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++ ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on cpu which is not smt supressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); ++ } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) ++{ ++ cpumask_t tmp; ++ int level; ++ ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif ++ ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); ++ ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); ++ } ++ ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; ++ ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; ++ } ++ ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; ++ ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) 
++ return select_fallback_rq(task_cpu(p), p); ++ ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } ++ ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** PDS ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. 
Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. 
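As an aside, the release/acquire pairing sketched in the comment above (smp_store_release() clearing ->on_cpu on the scheduling-out side, smp_cond_load_acquire() spinning on it in the waker) can be modelled with plain C11 atomics. The following is a hypothetical userspace sketch, not kernel code; task_state and on_cpu are invented names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int task_state;           /* plain data published before the release */
static atomic_int on_cpu = 1;    /* models p->on_cpu */

static void *scheduler_side(void *arg)
{
	(void)arg;
	task_state = 42;                              /* last activity on the old CPU */
	atomic_store_explicit(&on_cpu, 0,
			      memory_order_release);  /* ~ smp_store_release(&p->on_cpu, 0) */
	return NULL;
}

static void *waker_side(void *arg)
{
	(void)arg;
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;                                     /* ~ smp_cond_load_acquire(&p->on_cpu, !VAL) */
	printf("waker observes task_state=%d\n", task_state);  /* guaranteed to be 42 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, scheduler_side, NULL);
	pthread_create(&b, NULL, waker_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Compile with -pthread; once the waker sees on_cpu == 0 it is guaranteed to also see the store to task_state, which is the ordering the comment above relies on.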
++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif ++ ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. 
++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); ++#endif ++ ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); ++ resched_curr(rq); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
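The slice handling in sched_fork() above can be illustrated with a tiny standalone model: the parent's remaining slice is halved and the child inherits that same half, so forking does not create extra pending timeslice. Names and the RESCHED threshold value below are invented for the example:

#include <stdio.h>

#define TOY_RESCHED_US 128   /* stand-in for RESCHED_US, value invented */

struct toy_task { int time_slice_us; };

static void toy_fork_slice(struct toy_task *parent, struct toy_task *child)
{
	/* parent keeps half, child inherits the same half: the total
	 * outstanding timeslice in the system does not grow on fork */
	parent->time_slice_us /= 2;
	child->time_slice_us = parent->time_slice_us;
}

int main(void)
{
	struct toy_task parent = { .time_slice_us = 4000 }, child = { 0 };

	toy_fork_slice(&parent, &child);
	printf("parent=%dus child=%dus\n", parent.time_slice_us, child.time_slice_us);

	if (child.time_slice_us < TOY_RESCHED_US)
		printf("slice nearly used up: expire and reschedule\n");
	return 0;
}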
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_mask can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
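For illustration only, the preempt-notifier machinery above amounts to a per-task list of callbacks fired around context switches. A minimal userspace sketch of that shape, with invented names and a plain singly linked list standing in for the kernel hlist:

#include <stdio.h>

struct toy_notifier;

struct toy_notifier_ops {
	void (*sched_in)(struct toy_notifier *n, int cpu);
	void (*sched_out)(struct toy_notifier *n, const char *next);
};

struct toy_notifier {
	const struct toy_notifier_ops *ops;
	struct toy_notifier *next;
};

static struct toy_notifier *toy_list;   /* per-task list in the real code */

static void toy_register(struct toy_notifier *n)
{
	n->next = toy_list;
	toy_list = n;
}

static void toy_fire_sched_out(const char *next)
{
	/* what fire_sched_out_preempt_notifiers() does per hlist entry */
	for (struct toy_notifier *n = toy_list; n; n = n->next)
		n->ops->sched_out(n, next);
}

static void my_sched_out(struct toy_notifier *n, const char *next)
{
	(void)n;
	printf("being preempted in favour of %s\n", next);
}

static const struct toy_notifier_ops my_ops = { .sched_out = my_sched_out };

int main(void)
{
	struct toy_notifier n = { .ops = &my_ops };

	toy_register(&n);
	toy_fire_sched_out("next-task");
	return 0;
}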
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. 
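The accounting done by pds_update_curr() above is easy to model outside the kernel: runtime deltas are accumulated in nanoseconds, while the remaining slice is kept in microseconds (the NS_TO_US() step) so it stays within a 32-bit range. A toy sketch with invented names:

#include <stdio.h>

struct toy_task {
	long long sched_time_ns;      /* total accounted runtime, ns */
	int       time_slice_us;      /* remaining slice kept in usecs */
	long long last_ran_ns;        /* rq clock at last accounting */
};

static void toy_update_curr(struct toy_task *p, long long clock_ns)
{
	long long delta = clock_ns - p->last_ran_ns;

	p->sched_time_ns += delta;
	p->time_slice_us -= (int)(delta / 1000);   /* ~ NS_TO_US() */
	p->last_ran_ns = clock_ns;
}

int main(void)
{
	struct toy_task t = { 0, 4000, 1000000 };   /* 4ms of slice left */

	toy_update_curr(&t, 3500000);               /* 2.5ms of runtime */
	printf("slice left: %d us\n", t.time_slice_us);   /* prints 1500 */
	return 0;
}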
++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ pds_update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void pds_scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++ ++ /** ++ * p->time_slice < RESCHED_US. We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_SCHED_SMT ++static int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ int cpu; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* ++ * _something_ may have changed the task, double check again ++ */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return; ++ curr = rq->curr; ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; ++ ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++/* ++ * pds_sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void pds_sg_balance_check(const struct rq *rq) ++{ ++ cpumask_t chk; ++ int i; ++ ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* Only cpu in slibing idle group will do the checking */ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the 
currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
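The remote-tick state machine described above hinges on atomic_fetch_add_unless(): the work requeues itself only while the state is still RUNNING, and an OFFLINING state is stepped down to OFFLINE exactly once. A userspace model using a C11 compare-and-swap loop in place of the kernel helper (names are invented, and the OFFLINING store below merely simulates a racing stop):

#include <stdatomic.h>
#include <stdio.h>

enum { TOY_OFFLINE, TOY_OFFLINING, TOY_RUNNING };

/* C11 equivalent of the kernel's atomic_fetch_add_unless() */
static int toy_fetch_add_unless(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	while (c != u && !atomic_compare_exchange_weak(v, &c, c + a))
		;
	return c;
}

int main(void)
{
	atomic_int state = TOY_RUNNING;

	/* remote tick while running: state untouched, work requeues itself */
	printf("requeue=%d\n",
	       toy_fetch_add_unless(&state, -1, TOY_RUNNING) == TOY_RUNNING);

	atomic_store(&state, TOY_OFFLINING);   /* pretend a stop path raced in */

	/* remote tick sees OFFLINING: steps down to OFFLINE, no requeue */
	printf("requeue=%d\n",
	       toy_fetch_add_unless(&state, -1, TOY_RUNNING) == TOY_RUNNING);
	printf("state=%d\n", atomic_load(&state));   /* 0 == TOY_OFFLINE */
	return 0;
}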
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) ++ cpufreq_update_this_cpu(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ 
end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); ++#endif ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. 
Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_deadline(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ pds_sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. 
++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ ++ requeue_task(p, rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. 
++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++ ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ /* rq lock may not held!! */ ++ update_rq_clock(rq); ++ ++ p->static_prio = new_static; ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->deadline -= task_deadline_diff(p); ++ p->deadline += static_deadline_diff(new_static); ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
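can_nice() above relies on the usual mapping between nice values [19,-20] and rlimit-style values [1,40], and sys_nice() below clamps the requested value before checking it. A small standalone sketch of that arithmetic, with invented toy_ names:

#include <stdio.h>

#define TOY_MIN_NICE (-20)
#define TOY_MAX_NICE 19

/* "Convert nice value [19,-20] to rlimit style value [1,40]" */
static long toy_nice_to_rlimit(long nice)
{
	return TOY_MAX_NICE - nice + 1;
}

static long toy_clamp(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	long current_nice = 0, increment = -7;
	long nice = toy_clamp(current_nice + increment, TOY_MIN_NICE, TOY_MAX_NICE);

	/* lowering nice requires RLIMIT_NICE >= this value, or CAP_SYS_NICE */
	printf("nice %ld -> rlimit style %ld\n", nice, toy_nice_to_rlimit(nice));  /* -7 -> 27 */
	return 0;
}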
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int level, prio = p->prio - MAX_RT_PRIO; ++ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; ++ ++ /* rt tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ preempt_disable(); ++ level = task_deadline_level(p, this_rq()); ++ preempt_enable(); ++ prio += level_to_nice_prio[level]; ++ if (idleprio_task(p)) ++ prio += NICE_WIDTH; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. 
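++ *
++ * (Editorial aside, not part of the original PDS patch: this is the
++ * common backend behind sched_setaffinity(2) and in-kernel
++ * set_cpus_allowed_ptr() callers; when the task is currently running
++ * on a CPU that the new mask excludes, the cpu-stopper based
++ * migration_cpu_stop() path below pushes it over to dest_cpu before
++ * returning.)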
++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. 
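++ *
++ * (Editorial aside, not part of the original PDS patch: the
++ * task_init_deadline_func_tbl[] above is indexed by policy, so only
++ * SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE tasks get a fresh virtual
++ * deadline of rq->clock + task_deadline_diff(p) on a policy change;
++ * the RT and ISO slots are NULL, which leaves their deadline at 0.)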
++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int old_policy = p->policy; ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int ++__sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. 
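++ *
++ * (Editorial aside, not part of the original PDS patch: the
++ * dl_squash_attr above is how PDS services SCHED_DEADLINE requests:
++ * they are rewritten to SCHED_FIFO with sched_priority 99, i.e. an
++ * internal prio of MAX_RT_PRIO - 1 - 99 == 0, the highest RT level the
++ * scheduler tracks.)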
++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. ++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. 
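++ *
++ * (Editorial aside, not part of the original PDS patch: in practice
++ * the RLIMIT_RTPRIO checks earlier in this function mean an
++ * unprivileged caller needs either CAP_SYS_NICE or a non-zero rtprio
++ * rlimit, e.g. "@audio - rtprio 95" in /etc/security/limits.conf,
++ * before a SCHED_FIFO/SCHED_RR request gets this far.)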
++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
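++ *
++ * (Editorial aside, not part of the original PDS patch:
++ * sched_copy_attr() below is what keeps sched_setattr(2) ABI
++ * compatible. A zero attr->size from old userspace is treated as
++ * SCHED_ATTR_SIZE_VER0, and copy_struct_from_user() accepts a larger
++ * buffer from newer userspace only if the bytes the kernel does not
++ * know about are zero, returning -E2BIG otherwise.)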
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. 
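++ *
++ * (Editorial aside, not part of the original PDS patch:
++ * sched_setparam(2) above is simply sched_setscheduler(2) called with
++ * SETPARAM_POLICY (-1), i.e. it updates the RT priority while leaving
++ * the task's policy untouched.)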
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. 
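++ *
++ * (Editorial aside, not part of the original PDS patch: glibc ships no
++ * wrappers for sched_setattr(2)/sched_getattr(2), so userspace usually
++ * defines struct sched_attr itself and calls, for example,
++ *
++ *	syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0);
++ *
++ * the size actually copied back is then reported in attr.size, as
++ * sched_attr_copy_to_user() below explains.)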
++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_mask, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask ++ */ ++ cpumask_copy(new_mask, cpus_mask); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_mask); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
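++ *
++ * (Editorial aside, not part of the original PDS patch: the usual
++ * userspace pattern for reaching this syscall is
++ *
++ *	cpu_set_t set;
++ *	CPU_ZERO(&set);
++ *	CPU_SET(2, &set);
++ *	sched_setaffinity(pid, sizeof(set), &set);
++ *
++ * which lands in the sched_setaffinity() helper above once the user
++ * mask has been copied in.)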
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); ++ } ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
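++ *
++ * (Editorial aside, not part of the original PDS patch: note that
++ * do_sched_yield() above is gated by PDS's sched_yield_type tunable:
++ * 0 turns sched_yield(2) into a no-op, 1 only reschedules, and 2 also
++ * expires the caller's time slice and requeues it before calling
++ * schedule().)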
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In PDS, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
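++ *
++ * (Editorial aside, not part of the original PDS patch:
++ * sched_rr_get_interval() above reports the same global PDS
++ * rr_interval, converted from milliseconds to a timespec, for every
++ * task regardless of policy or nice level.)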
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* PDS TODO: should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. 
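++ *
++ * (Editorial aside, not part of the original PDS patch: the wake_q
++ * machinery is used by callers such as the futex and rt_mutex code to
++ * collect wakeups while still holding a lock and to issue them later
++ * through wake_up_q() once the lock is dropped, so the waker is not
++ * preempted by the tasks it is waking.)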
++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { ++ int dest_cpu; ++ ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ ++ /* skip the running task */ ++ if (task_running(p)) ++ continue; ++ ++ /* ++ * Rules for changing task_struct::cpus_mask are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. 
++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ node = &rq->sl_header; ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. 
This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
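++ *
++ * (Editorial aside, not part of the original PDS patch: in other
++ * words, the sched_smt_present static key below is only enabled once a
++ * core's second hardware thread comes online, so SMT-aware paths stay
++ * patched out on machines without SMT.)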
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ 
cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; ++#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. 
Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef 
CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +new file mode 100644 +index 000000000000..b3926a8425b2 +--- /dev/null ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,481 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. 
++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ struct skiplist_node sl_header; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ 
++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : PDS need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. 
++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* PDS_SCHED_H */ +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..d3d12baa9036 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..26d6b47fc156 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_PDS + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_PDS + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_PDS */ + + #else + ++#ifndef CONFIG_SCHED_PDS + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..4fc9f2ead4d2 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" ++#else ++ + #include + + #include +@@ -2496,3 +2500,4 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++#endif /* !CONFIG_SCHED_PDS */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..45bd43942575 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..204933ebc95a 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_PDS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_PDS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof 
(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..71af3cd30ccc 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_PDS + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..3eaa2a21caa4 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux56-rc-tkg/linux56-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux56-rc-tkg/linux56-tkg-patches/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..7425de1 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,192 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. 
Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case 
PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, + /* Amazon Annapurna Labs */ + { PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + diff --git a/linux56-rc-tkg/linux56-tkg-patches/0007-v5.6-fsync.patch b/linux56-rc-tkg/linux56-tkg-patches/0007-v5.6-fsync.patch new file mode 100644 index 0000000..027116f --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0007-v5.6-fsync.patch @@ -0,0 +1,419 @@ +split the futex key setup from the queue locking and key reading. This +is useful to support the setup of multiple keys at the same time, like +what is done in futex_requeue() and what will be done for the +FUTEX_WAIT_MULTIPLE command. + +Signed-off-by: Gabriel Krisman Bertazi +--- + kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 42 insertions(+), 29 deletions(-) + +diff --git a/kernel/futex.c b/kernel/futex.c +index 6d50728ef2e7..91f3db335c57 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2631,6 +2631,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + __set_current_state(TASK_RUNNING); + } + ++static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ ++ u32 uval; ++ int ret; ++ ++retry_private: ++ *hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (ret) { ++ queue_unlock(*hb); ++ ++ ret = get_user(uval, uaddr); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ return 1; ++ } ++ ++ if (uval != val) { ++ queue_unlock(*hb); ++ ret = -EWOULDBLOCK; ++ } ++ ++ return ret; ++} ++ + /** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address +@@ -2651,7 +2684,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) + { +- u32 uval; + int ret; + + /* +@@ -2672,38 +2704,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. + */ +-retry: +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +-retry_private: +- *hb = queue_lock(q); ++ do { ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, ++ &q->key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; + +- ret = get_futex_value_locked(&uval, uaddr); ++ ret = __futex_wait_setup(uaddr, val, flags, q, hb); + +- if (ret) { +- queue_unlock(*hb); +- +- ret = get_user(uval, uaddr); ++ /* Drop key reference if retry or error. 
*/ + if (ret) +- goto out; ++ put_futex_key(&q->key); ++ } while (ret > 0); + +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- put_futex_key(&q->key); +- goto retry; +- } +- +- if (uval != val) { +- queue_unlock(*hb); +- ret = -EWOULDBLOCK; +- } +- +-out: +- if (ret) +- put_futex_key(&q->key); + return ret; + } + +-- +2.20.1 + +This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows +a thread to wait on several futexes at the same time, and be awoken by +any of them. In a sense, it implements one of the features that was +supported by pooling on the old FUTEX_FD interface. + +My use case for this operation lies in Wine, where we want to implement +a similar interface available in Windows, used mainly for event +handling. The wine folks have an implementation that uses eventfd, but +it suffers from FD exhaustion (I was told they have application that go +to the order of multi-milion FDs), and higher CPU utilization. + +In time, we are also proposing modifications to glibc and libpthread to +make this feature available for Linux native multithreaded applications +using libpthread, which can benefit from the behavior of waiting on any +of a group of futexes. + +In particular, using futexes in our Wine use case reduced the CPU +utilization by 4% for the game Beat Saber and by 1.5% for the game +Shadow of Tomb Raider, both running over Proton (a wine based solution +for Windows emulation), when compared to the eventfd interface. This +implementation also doesn't rely of file descriptors, so it doesn't risk +overflowing the resource. + +Technically, the existing FUTEX_WAIT implementation can be easily +reworked by using do_futex_wait_multiple with a count of one, and I +have a patch showing how it works. I'm not proposing it, since +futex is such a tricky code, that I'd be more confortable to have +FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, +before considering modifying FUTEX_WAIT. + +From an implementation perspective, the futex list is passed as an array +of (pointer,value,bitset) to the kernel, which will enqueue all of them +and sleep if none was already triggered. It returns a hint of which +futex caused the wake up event to userspace, but the hint doesn't +guarantee that is the only futex triggered. Before calling the syscall +again, userspace should traverse the list, trying to re-acquire any of +the other futexes, to prevent an immediate -EWOULDBLOCK return code from +the kernel. + +This was tested using three mechanisms: + +1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and +running the unmodified tools/testing/selftests/futex and a full linux +distro on top of this kernel. + +2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a +multi-threaded, event-handling setup. + +3) By running the Wine fsync implementation and executing multi-threaded +applications, in particular the modern games mentioned above, on top of +this implementation. + +Signed-off-by: Zebediah Figura +Signed-off-by: Steven Noonan +Signed-off-by: Pierre-Loup A. 
Griffais +Signed-off-by: Gabriel Krisman Bertazi +--- + include/uapi/linux/futex.h | 7 ++ + kernel/futex.c | 161 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 164 insertions(+), 4 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..2401c4cf5095 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -150,4 +151,10 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 91f3db335c57..2623e8f152cd 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -183,6 +183,7 @@ static int __read_mostly futex_cmpxchg_enabled; + #endif + #define FLAGS_CLOCKRT 0x02 + #define FLAGS_HAS_TIMEOUT 0x04 ++#define FLAGS_WAKE_MULTIPLE 0x08 + + /* + * Priority Inheritance state: +@@ -2720,6 +2721,150 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++static int do_futex_wait_multiple(struct futex_wait_block *wb, ++ u32 count, unsigned int flags, ++ ktime_t *abs_time) ++{ ++ ++ struct hrtimer_sleeper timeout, *to; ++ struct futex_hash_bucket *hb; ++ struct futex_q *qs = NULL; ++ int ret; ++ int i; ++ ++ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); ++ if (!qs) ++ return -ENOMEM; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++ retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ qs[i].bitset = wb[i].bitset; ++ ++ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret != 0)) { ++ for (--i; i >= 0; i--) ++ put_futex_key(&qs[i].key); ++ goto out; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, ++ flags, &qs[i], &hb); ++ if (ret) { ++ /* Drop the failed key directly. keys 0..(i-1) ++ * will be put by unqueue_me. ++ */ ++ put_futex_key(&qs[i].key); ++ ++ /* Undo the partial work we did. */ ++ for (--i; i >= 0; i--) ++ unqueue_me(&qs[i]); ++ ++ __set_current_state(TASK_RUNNING); ++ if (ret > 0) ++ goto retry; ++ goto out; ++ } ++ ++ /* We can't hold to the bucket lock when dealing with ++ * the next futex. Queue ourselves now so we can unlock ++ * it before moving on. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* There is no easy to way to check if we are wake already on ++ * multiple futexes without waking through each one of them. So ++ * just sleep and let the scheduler handle it. ++ */ ++ if (!to || to->task) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = -ETIMEDOUT; ++ /* If we were woken (and unqueued), we succeeded. */ ++ for (i = 0; i < count; i++) ++ if (!unqueue_me(&qs[i])) ++ ret = i; ++ ++ /* Succeed wakeup */ ++ if (ret >= 0) ++ goto out; ++ ++ /* Woken by triggered timeout */ ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. 
++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ ret = -ERESTART_RESTARTBLOCK; ++ out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ kfree(qs); ++ return ret; ++} ++ ++static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, ++ u32 count, ktime_t *abs_time) ++{ ++ struct futex_wait_block *wb; ++ struct restart_block *restart; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); ++ if (!wb) ++ return -ENOMEM; ++ ++ if (copy_from_user(wb, uaddr, ++ count * sizeof(struct futex_wait_block))) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ ret = do_futex_wait_multiple(wb, count, flags, abs_time); ++ ++ if (ret == -ERESTART_RESTARTBLOCK) { ++ restart = ¤t->restart_block; ++ restart->fn = futex_wait_restart; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = count; ++ restart->futex.time = *abs_time; ++ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | ++ FLAGS_WAKE_MULTIPLE); ++ } ++ ++out: ++ kfree(wb); ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -2797,6 +2942,10 @@ static long futex_wait_restart(struct restart_block *restart) + } + restart->fn = do_no_restart_syscall; + ++ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) ++ return (long)futex_wait_multiple(uaddr, restart->futex.flags, ++ restart->futex.val, tp); ++ + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); + } +@@ -3680,6 +3829,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ case FUTEX_WAIT_MULTIPLE: ++ return futex_wait_multiple(uaddr, flags, val, timeout); + } + return -ENOSYS; + } +@@ -3696,7 +3847,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3705,7 +3857,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -3889,14 +4041,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +-- +2.20.1 diff --git a/linux56-rc-tkg/linux56-tkg-patches/0009-bmq_v5.6-r0.patch b/linux56-rc-tkg/linux56-tkg-patches/0009-bmq_v5.6-r0.patch new file mode 100644 index 0000000..cf7bb1f --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0009-bmq_v5.6-r0.patch @@ -0,0 +1,7624 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e0..80d796db0935 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -432,6 +432,11 @@ + embedded devices based on command line input. + See Documentation/block/cmdline-partition.rst + ++ bmq.timeslice= [KNL] Time slice in us for BMQ scheduler. ++ Format: (must be >= 1000) ++ Default: 4000 ++ See Documentation/scheduler/sched-BMQ.txt ++ + boot_delay= Milliseconds to delay each printk during boot. + Values larger than 10 seconds (10000) are changed to + no delay (0). +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index def074807cee..e4bc9350f192 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -105,6 +105,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -1173,3 +1174,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ CPU scheduler only. This determines what type of yield calls to ++sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. 
When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. ++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index 737ff3b9c2c0..b5bc5a1b6de7 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 82a4d37ddecb..1130e0f5db72 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. + */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. 
+ */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 716ad1d8d95e..9d08ce1d6e6c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -649,13 +649,18 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BMQ) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_BMQ + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -669,6 +674,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_BMQ */ + #endif + int on_rq; + +@@ -677,13 +683,23 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_BMQ ++ u64 last_ran; ++ s64 time_slice; ++ int boost_prio; ++ int bmq_idx; ++ struct list_head bmq_node; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_BMQ */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1298,6 +1314,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_BMQ ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_BMQ */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..02a3c5d34ee4 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_BMQ ++ ++#define __tsk_deadline(p) (0UL) ++ ++static inline int dl_prio(int prio) ++{ ++ return 0; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return (SCHED_NORMAL == p->policy); ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_BMQ */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..d9dc5d3ccd2e 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,17 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++#ifdef CONFIG_SCHED_BMQ ++/* +/- priority levels from the base priority */ ++#define MAX_PRIORITY_ADJ 4 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..6387c8ea9832 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_BMQ + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/init/Kconfig b/init/Kconfig +index a34064a031a5..256e555538b7 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -697,9 +697,20 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++ Say Y here. ++ default y ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_BMQ + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -786,6 +797,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_BMQ + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -887,7 +899,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_BMQ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1134,6 +1146,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BMQ + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..c293de91d90f 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_BMQ ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +84,12 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +97,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 58f5073acff7..9301e25986d3 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP 
++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1005,7 +1005,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_BMQ */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 2833ffb0c211..37a1f8d73eee 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..3ad290e9fed8 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_BMQ ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 851bbb10819d..019fdab7e329 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. 
+ */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..ac31239aa51a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_BMQ ++obj-y += bmq.o bmq_debug.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/bmq.c b/kernel/sched/bmq.c +new file mode 100644 +index 000000000000..6a5ab93a30bb +--- /dev/null ++++ b/kernel/sched/bmq.c +@@ -0,0 +1,5999 @@ ++/* ++ * kernel/sched/bmq.c ++ * ++ * BMQ Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
++ */ ++#include "bmq_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* Default time slice is 4 in ms, can be set via kernel parameter "bmq.timeslice" */ ++u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); ++ ++static int __init sched_timeslice(char *str) ++{ ++ int timeslice_us; ++ ++ get_option(&str, ×lice_us); ++ if (timeslice_us >= 1000) ++ sched_timeslice_ns = timeslice_us * 1000; ++ ++ return 0; ++} ++early_param("bmq.timeslice", sched_timeslice); ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "bmq: BMQ CPU Scheduler 5.5-r3 by Alfred Chen.\n"); ++} ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (sched_timeslice_ns >>\ ++ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++static cpumask_t sched_rq_watermark[bmq_BITS] ____cacheline_aligned_in_smp; ++ ++#if (bmq_BITS <= BITS_PER_LONG) ++#define bmq_find_first_bit(bm) __ffs((bm[0])) ++#define bmq_find_next_bit(bm, start) __ffs(BITMAP_FIRST_WORD_MASK(start) & bm[0]) ++#else ++#define bmq_find_first_bit(bm) find_first_bit((bm), bmq_BITS) ++#define bmq_find_next_bit(bm, start) find_next_bit(bm, bmq_BITS, start) ++#endif ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = bmq_find_first_bit(rq->queue.bitmap); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline int task_sched_prio(struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO)? 
0:p->prio - MAX_RT_PRIO + p->boost_prio + 1; ++} ++ ++static inline void bmq_init(struct bmq *q) ++{ ++ int i; ++ ++ bitmap_zero(q->bitmap, bmq_BITS); ++ for(i = 0; i < bmq_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void bmq_init_idle(struct bmq *q, struct task_struct *idle) ++{ ++ INIT_LIST_HEAD(&q->heads[IDLE_TASK_SCHED_PRIO]); ++ list_add(&idle->bmq_node, &q->heads[IDLE_TASK_SCHED_PRIO]); ++ set_bit(IDLE_TASK_SCHED_PRIO, q->bitmap); ++} ++ ++static inline void bmq_add_task(struct task_struct *p, struct bmq *q, int idx) ++{ ++ struct list_head *n; ++ ++ if (likely(idx)) { ++ list_add_tail(&p->bmq_node, &q->heads[idx]); ++ return; ++ } ++ ++ list_for_each(n, &q->heads[idx]) ++ if (list_entry(n, struct task_struct, bmq_node)->prio > p->prio) ++ break; ++ __list_add(&p->bmq_node, n->prev, n); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *rq_first_bmq_task(struct rq *rq) ++{ ++ unsigned long idx = bmq_find_first_bit(rq->queue.bitmap); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++rq_next_bmq_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = bmq_find_next_bit(rq->queue.bitmap, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = rq_first_bmq_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = rq_next_bmq_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); 
++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ list_del(&p->bmq_node); ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->bmq_idx = task_sched_prio(p); ++ bmq_add_task(p, &rq->queue, p->bmq_idx); ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ int idx = task_sched_prio(p); ++ ++ lockdep_assert_held(&rq->lock); ++ WARN_ONCE(task_rq(p) != rq, "bmq: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ list_del(&p->bmq_node); ++ bmq_add_task(p, &rq->queue, idx); ++ if (idx != p->bmq_idx) { ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ p->bmq_idx = idx; ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. 
++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++static inline void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++static inline bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (cpu_online(cpu) && !wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++ ++#endif /* CONFIG_NO_HZ_COMMON */ ++#endif /* CONFIG_SMP */ ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (rq_first_bmq_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * BMQ doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. 
++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. 
++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). 
*/ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. 
++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
*/ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. 
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(p->cpus_ptr, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else /* CONFIG_SMP */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** BMQ ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. 
++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. 
++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. 
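The comment in try_to_wake_up() above, about the caller's CONDITION=1 store not being reordered past the p->state check, is the kernel's guard against the classic lost-wakeup race. The usual userspace rendering of the same rule, purely as an analogy and not part of the patch, is to publish the condition and re-test it under one lock:

/* Illustration only: userspace analogue of the "store CONDITION, then wake"
 * ordering that try_to_wake_up() relies on above.
 * Build: cc -pthread -o wake wake.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int condition;                       /* the CONDITION from the comment */

static void *waker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    condition = 1;                          /* publish CONDITION ...             */
    pthread_cond_signal(&cond);             /* ... and only then wake the waiter */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, waker, NULL);
    pthread_mutex_lock(&lock);
    while (!condition)                      /* re-check; tolerates spurious wakeups */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
    puts("woken with condition set");
    pthread_join(t, NULL);
    return 0;
}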
++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if(cpu_rq(smp_processor_id())->clock - p->last_ran > sched_timeslice_ns) ++ boost_task(p); ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ * ++ * __sched_fork() is basic setup used by init_idle() too: ++ */ ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ p->on_rq = 0; ++ p->on_cpu = 0; ++ p->utime = 0; ++ p->stime = 0; ++ p->sched_time = 0; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++} ++ ++/* ++ * fork()/clone()-time setup: ++ */ ++int sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++ __sched_fork(clone_flags, p); ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. 
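The smp_cond_load_acquire(&p->on_cpu, !VAL) above pairs with the smp_store_release() of ->on_cpu on the CPU scheduling the task out, as described in the BLOCKING notes earlier. A userspace analogue of that handoff with C11 atomics, illustration only (all names here are made up):

/* Illustration only: C11 analogue of the ->on_cpu release/acquire handoff
 * (smp_store_release() when scheduling out, smp_cond_load_acquire() above).
 * Build: cc -pthread -std=c11 -o handoff handoff.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;                  /* state written before the release store */
static atomic_int on_cpu = 1;        /* analogue of p->on_cpu                  */

static void *prev_cpu(void *arg)     /* the CPU scheduling the task out        */
{
    (void)arg;
    payload = 42;                                              /* plain store ...    */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);   /* ... published here */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, prev_cpu, NULL);
    /* analogue of smp_cond_load_acquire(&p->on_cpu, !VAL) */
    while (atomic_load_explicit(&on_cpu, memory_order_acquire))
        ;
    printf("payload=%d\n", payload);    /* the acquire guarantees we see 42 */
    pthread_join(t, NULL);
    return 0;
}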
++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ resched_curr(rq); ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
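sched_fork() above halves the parent's remaining slice, hands the child the same half, and refills the child while rescheduling the parent when the remainder drops below RESCHED_NS. A standalone sketch of just that arithmetic; the constants are stand-ins, not values taken from this patch:

/* Illustration only: the fork-time timeslice split performed above.
 * SLICE_NS and RESCHED_NS are stand-ins for sched_timeslice_ns / RESCHED_NS. */
#include <stdio.h>

#define SLICE_NS   4000000LL     /* assumed full slice, 4 ms  */
#define RESCHED_NS  100000LL     /* assumed resched threshold */

struct fake_task { long long time_slice; int need_resched; };

static void fork_split(struct fake_task *parent, struct fake_task *child)
{
    parent->time_slice /= 2;                   /* parent keeps half ...          */
    child->time_slice = parent->time_slice;    /* ... child gets the same half   */
    if (child->time_slice < RESCHED_NS) {      /* not enough left for either:    */
        child->time_slice = SLICE_NS;          /* child starts on a fresh slice, */
        parent->need_resched = 1;              /* parent is marked for resched   */
    }
}

int main(void)
{
    struct fake_task parent = { 150000LL, 0 }, child = { 0, 0 };

    fork_split(&parent, &child);
    printf("parent=%lld ns (resched=%d), child=%lld ns\n",
           parent.time_slice, parent.need_resched, child.time_slice);
    return 0;
}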
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
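The schedstats plumbing above surfaces at runtime as the kernel.sched_schedstats sysctl and at boot as the schedstats=enable/disable parameter parsed by setup_schedstats(). A small reader for the sysctl file, illustration only:

/* Illustration only: read the runtime switch wired up by sysctl_schedstats()
 * above; the same knob is kernel.sched_schedstats via sysctl(8). */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/kernel/sched_schedstats", "r");
    int enabled = -1;

    if (!f) {
        perror("/proc/sys/kernel/sched_schedstats");
        return 1;
    }
    if (fscanf(f, "%d", &enabled) != 1)
        enabled = -1;
    fclose(f);
    printf("kernel.sched_schedstats = %d\n", enabled);
    return 0;
}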
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * sched_exec - execve() is a valuable balancing opportunity, because at ++ * this point the task has the smallest effective memory and cache ++ * footprint. 
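The IO-wait discussion above concerns the numbers exported through nr_iowait_cpu() and /proc/stat (the 5th field of each cpu line, in USER_HZ ticks). A reader for the aggregate line, illustration only; interpret the value with the caveats spelled out above:

/* Illustration only: the iowait figure the comment above cautions about is
 * the 5th field of the aggregate "cpu" line in /proc/stat (USER_HZ ticks). */
#include <stdio.h>

int main(void)
{
    unsigned long long user, nice_t, sys, idle, iowait;
    FILE *f = fopen("/proc/stat", "r");

    if (!f || fscanf(f, "cpu %llu %llu %llu %llu %llu",
                     &user, &nice_t, &sys, &idle, &iowait) != 5) {
        perror("/proc/stat");
        return 1;
    }
    fclose(f);
    /* A hint, not precise per-CPU accounting -- see the caveats above. */
    printf("idle=%llu ticks, iowait=%llu ticks\n", idle, iowait);
    return 0;
}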
++ */ ++void sched_exec(void) ++{ ++ struct task_struct *p = current; ++ int dest_cpu; ++ ++ if (task_rq(p)->nr_running < 2) ++ return; ++ ++ dest_cpu = cpumask_any_and(p->cpus_ptr, &sched_rq_watermark[IDLE_WM]); ++ if ( dest_cpu < nr_cpu_ids) { ++#ifdef CONFIG_SCHED_SMT ++ int smt = cpumask_any_and(p->cpus_ptr, &sched_sg_idle_mask); ++ if (smt < nr_cpu_ids) ++ dest_cpu = smt; ++#endif ++ if (likely(cpu_active(dest_cpu))) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); ++ return; ++ } ++ } ++} ++ ++#endif ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. ++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. 
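task_sched_runtime() above is what the per-task CPU clocks are built on, so the same accounting is visible from userspace through clock_gettime(2) with CLOCK_THREAD_CPUTIME_ID. Illustration only, not part of the patch:

/* Illustration only: the runtime maintained by update_curr() and read by
 * task_sched_runtime() above backs the per-thread CPU-time clock. */
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec ts;
    volatile unsigned long spin;

    for (spin = 0; spin < 50000000UL; spin++)   /* burn a little CPU time */
        ;

    if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) < 0) {
        perror("clock_gettime");
        return 1;
    }
    printf("thread CPU time so far: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
    return 0;
}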
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { ++ int cpu = cpu_of(rq); ++ int dcpu = __best_mask_cpu(cpu, &tmp, ++ per_cpu(sched_cpu_llc_mask, cpu)); ++ rq = move_queued_task(rq, p, dcpu); ++ } ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq= cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void sg_balance_check(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu; ++ ++ /* exit when no sg in idle */ ++ if (cpumask_empty(&sched_sg_idle_mask)) ++ return; ++ ++ cpu = cpu_of(rq); ++ /* ++ * Only cpu in slibing idle group will do the checking and then ++ * find potential cpus which can migrate the current running task ++ */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && ++ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { ++ int i, tried = 0; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ if (cpumask_subset(cpu_smt_mask(i), &chk)) { ++ if (sg_balance_trigger(i)) ++ return; ++ if (tried) ++ return; ++ tried++; ++ } ++ } ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. 
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ scheduler_task_tick(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? 
++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = 
min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = rq_next_bmq_task(skip, rq)) != rq->idle) { ++ skip = rq_next_bmq_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *affinity_mask, *end_mask; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ nr_migrated = migrate_pending_tasks(src_rq, rq, cpu); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated) { ++ cpufreq_update_util(rq, 0); ++ return 1; ++ } ++ } ++ } while (++affinity_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. ++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++ } ++} ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ next = rq_runnable_task(rq); ++#endif ++ rq->skip = NULL; ++ return next; ++ } ++ ++ next = rq_first_bmq_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ return rq_first_bmq_task(rq); ++#endif ++ return next; ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++ if (unlikely(sched_timeslice_ns == p->time_slice)) ++ rq->last_ts_switch = rq->clock; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, p->time_slice); ++#endif ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. 
++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which BMQ doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ if (rq_switch_time(rq) < boost_threshold(prev)) ++ boost_task(prev); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (MAX_PRIO == next->prio) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
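Besides blocking, the one way a userspace thread can voluntarily ask for the "explicit schedule() call" case listed above is sched_yield(2), which ends up in the scheduler just like the other entry points. Trivial illustration, not part of the patch:

/* Illustration only: sched_yield(2) is the userspace request for the
 * "explicit schedule() call" case described above. */
#include <sched.h>
#include <stdio.h>

int main(void)
{
    int i;

    for (i = 0; i < 3; i++) {
        printf("iteration %d, yielding\n", i);
        sched_yield();               /* enters the scheduler voluntarily */
    }
    return 0;
}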
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ rq->last_ts_switch = rq->clock; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. 
Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. */ ++ if (task_on_rq_queued(p) && task_sched_prio(p) != p->bmq_idx) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. 
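rt_mutex_setprio() above is the boost/deboost half of priority inheritance; the usual way userspace ends up on that path is a PI futex, i.e. a pthread mutex created with the PTHREAD_PRIO_INHERIT protocol. Minimal sketch, illustration only (actual boosting only happens once a higher-priority task contends for the lock):

/* Illustration only: a PTHREAD_PRIO_INHERIT mutex is backed by a PI futex,
 * which is what reaches the rt_mutex/rt_mutex_setprio() machinery above.
 * Build: cc -pthread -o pi pi.c
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_mutex_t lock;

    pthread_mutexattr_init(&attr);
    if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
        fprintf(stderr, "PTHREAD_PRIO_INHERIT not supported\n");
        return 1;
    }
    pthread_mutex_init(&lock, &attr);

    pthread_mutex_lock(&lock);
    /* While held, a higher-priority contender would lend us its priority. */
    pthread_mutex_unlock(&lock);

    pthread_mutex_destroy(&lock);
    pthread_mutexattr_destroy(&attr);
    return 0;
}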
There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
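++ * When keep_boost is true, p->prio is re-derived via rt_effective_prio()
++ * below so that an active rt_mutex priority boost is not lost.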
++ */
++ p->prio = normal_prio(p);
++ if (keep_boost)
++ p->prio = rt_effective_prio(p, p->prio);
++}
++
++/*
++ * check the target process has a UID that matches the current process's
++ */
++static bool check_same_owner(struct task_struct *p)
++{
++ const struct cred *cred = current_cred(), *pcred;
++ bool match;
++
++ rcu_read_lock();
++ pcred = __task_cred(p);
++ match = (uid_eq(cred->euid, pcred->euid) ||
++ uid_eq(cred->euid, pcred->uid));
++ rcu_read_unlock();
++ return match;
++}
++
++static int __sched_setscheduler(struct task_struct *p,
++ const struct sched_attr *attr,
++ bool user, bool pi)
++{
++ const struct sched_attr dl_squash_attr = {
++ .size = sizeof(struct sched_attr),
++ .sched_policy = SCHED_FIFO,
++ .sched_nice = 0,
++ .sched_priority = 99,
++ };
++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority;
++ int retval, oldpolicy = -1;
++ int policy = attr->sched_policy;
++ unsigned long flags;
++ struct rq *rq;
++ int reset_on_fork;
++ raw_spinlock_t *lock;
++
++ /* The pi code expects interrupts enabled */
++ BUG_ON(pi && in_interrupt());
++
++ /*
++ * BMQ supports SCHED_DEADLINE by squashing it as prio 0 SCHED_FIFO
++ */
++ if (unlikely(SCHED_DEADLINE == policy)) {
++ attr = &dl_squash_attr;
++ policy = attr->sched_policy;
++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority;
++ }
++recheck:
++ /* Double check policy once rq lock held */
++ if (policy < 0) {
++ reset_on_fork = p->sched_reset_on_fork;
++ policy = oldpolicy = p->policy;
++ } else {
++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK);
++
++ if (policy > SCHED_IDLE)
++ return -EINVAL;
++ }
++
++ if (attr->sched_flags & ~(SCHED_FLAG_ALL))
++ return -EINVAL;
++
++ /*
++ * Valid priorities for SCHED_FIFO and SCHED_RR are
++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
++ * SCHED_BATCH and SCHED_IDLE is 0.
++ */
++ if (attr->sched_priority < 0 ||
++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) ||
++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1))
++ return -EINVAL;
++ if ((SCHED_RR == policy || SCHED_FIFO == policy) !=
++ (attr->sched_priority != 0))
++ return -EINVAL;
++
++ /*
++ * Allow unprivileged RT tasks to decrease priority:
++ */
++ if (user && !capable(CAP_SYS_NICE)) {
++ if (SCHED_FIFO == policy || SCHED_RR == policy) {
++ unsigned long rlim_rtprio =
++ task_rlimit(p, RLIMIT_RTPRIO);
++
++ /* Can't set/change the rt policy */
++ if (policy != p->policy && !rlim_rtprio)
++ return -EPERM;
++
++ /* Can't increase priority */
++ if (attr->sched_priority > p->rt_priority &&
++ attr->sched_priority > rlim_rtprio)
++ return -EPERM;
++ }
++
++ /* Can't change other user's priorities */
++ if (!check_same_owner(p))
++ return -EPERM;
++
++ /* Normal users shall not reset the sched_reset_on_fork flag */
++ if (p->sched_reset_on_fork && !reset_on_fork)
++ return -EPERM;
++ }
++
++ if (user) {
++ retval = security_task_setscheduler(p);
++ if (retval)
++ return retval;
++ }
++
++ if (pi)
++ cpuset_read_lock();
++
++ /*
++ * Make sure no PI-waiters arrive (or leave) while we are
++ * changing the priority of the task:
++ */
++ raw_spin_lock_irqsave(&p->pi_lock, flags);
++
++ /*
++ * To be able to change p->policy safely, task_access_lock()
++ * must be called.
++ * If task_access_lock() is used here:
++ * For the task p which is not running, reading rq->stop is
++ * racy but acceptable as ->stop doesn't change much.
++ * An enhancement can be made to read rq->stop safely.
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. 
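++ * @flags: for future extension; must be 0 (the syscall returns -EINVAL otherwise).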
++ */
++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
++ unsigned int, flags)
++{
++ struct sched_attr attr;
++ struct task_struct *p;
++ int retval;
++
++ if (!uattr || pid < 0 || flags)
++ return -EINVAL;
++
++ retval = sched_copy_attr(uattr, &attr);
++ if (retval)
++ return retval;
++
++ if ((int)attr.sched_policy < 0)
++ return -EINVAL;
++
++ rcu_read_lock();
++ retval = -ESRCH;
++ p = find_process_by_pid(pid);
++ if (p != NULL)
++ retval = sched_setattr(p, &attr);
++ rcu_read_unlock();
++
++ return retval;
++}
++
++/**
++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
++ * @pid: the pid in question.
++ *
++ * Return: On success, the policy of the thread. Otherwise, a negative error
++ * code.
++ */
++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
++{
++ struct task_struct *p;
++ int retval = -EINVAL;
++
++ if (pid < 0)
++ goto out_nounlock;
++
++ retval = -ESRCH;
++ rcu_read_lock();
++ p = find_process_by_pid(pid);
++ if (p) {
++ retval = security_task_getscheduler(p);
++ if (!retval)
++ retval = p->policy;
++ }
++ rcu_read_unlock();
++
++out_nounlock:
++ return retval;
++}
++
++/**
++ * sys_sched_getparam - get the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the RT priority.
++ *
++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
++ * code.
++ */
++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
++{
++ struct sched_param lp = { .sched_priority = 0 };
++ struct task_struct *p;
++ int retval = -EINVAL;
++
++ if (!param || pid < 0)
++ goto out_nounlock;
++
++ rcu_read_lock();
++ p = find_process_by_pid(pid);
++ retval = -ESRCH;
++ if (!p)
++ goto out_unlock;
++
++ retval = security_task_getscheduler(p);
++ if (retval)
++ goto out_unlock;
++
++ if (task_has_rt_policy(p))
++ lp.sched_priority = p->rt_priority;
++ rcu_read_unlock();
++
++ /*
++ * This one might sleep, we cannot do it with a spinlock held ...
++ */
++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
++
++out_nounlock:
++ return retval;
++
++out_unlock:
++ rcu_read_unlock();
++ return retval;
++}
++
++/*
++ * Copy the kernel size attribute structure (which might be larger
++ * than what user-space knows about) to user-space.
++ *
++ * Note that all cases are valid: user-space buffer can be larger or
++ * smaller than the kernel-space buffer. The usual case is that both
++ * have the same size.
++ */
++static int
++sched_attr_copy_to_user(struct sched_attr __user *uattr,
++ struct sched_attr *kattr,
++ unsigned int usize)
++{
++ unsigned int ksize = sizeof(*kattr);
++
++ if (!access_ok(uattr, usize))
++ return -EFAULT;
++
++ /*
++ * sched_getattr() ABI forwards and backwards compatibility:
++ *
++ * If usize == ksize then we just copy everything to user-space and all is good.
++ *
++ * If usize < ksize then we only copy as much as user-space has space for,
++ * this keeps ABI compatibility as well. We skip the rest.
++ *
++ * If usize > ksize then user-space is using a newer version of the ABI,
++ * which part the kernel doesn't know about. Just ignore it - tooling can
++ * detect the kernel's knowledge of attributes from the attr->size value
++ * which is set to ksize in this case.
++ */
++ kattr->size = min(usize, ksize);
++
++ if (copy_to_user(uattr, kattr, kattr->size))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
++ * @pid: the pid in question.
++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
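++ *
++ * A @len smaller than the kernel cpumask size is treated as if the missing
++ * bytes were zero, and a larger @len is truncated to the kernel cpumask
++ * size (see get_user_cpu_mask() above).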
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) { ++ current->boost_prio = MAX_PRIORITY_ADJ; ++ requeue_task(current, rq); ++ } ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
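++ *
++ * Returns 1 if the lock was dropped and reacquired (a reschedule was
++ * pending or the lock was contended), 0 if the lock was held throughout.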
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In BMQ, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
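++ *
++ * For SCHED_FIFO and SCHED_RR this is MAX_USER_RT_PRIO-1 (99); for
++ * SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE it is 0.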
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(sched_timeslice_ns); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
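++ * (TASK_IDLE is TASK_UNINTERRUPTIBLE | TASK_NOLOAD, so it would otherwise
++ * match a TASK_UNINTERRUPTIBLE filter.)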
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: BMQ should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: CPU the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ __sched_fork(0, idle); ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter as it will never be de/enqueued */ ++ idle->prio = MAX_PRIO; ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ bmq_init_idle(&rq->queue, idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. 
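++ *
++ * Such kthreads carry PF_NO_SETAFFINITY and are rejected below with
++ * -EINVAL.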
++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ p = rq_first_bmq_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = rq_first_bmq_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. 
++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ per_cpu(sd_llc_id, cpu) = cpu; ++ } ++} ++ ++#define TOPOLOGY_CPUMASK(name, func) \ ++ if (cpumask_and(chk, chk, func(cpu))) { \ ++ per_cpu(sched_cpu_llc_mask, cpu) = chk; \ ++ per_cpu(sd_llc_id, cpu) = cpumask_first(func(cpu)); \ ++ printk(KERN_INFO "bmq: cpu#%d affinity mask - "#name" 0x%08lx", \ ++ cpu, (chk++)->bits[0]); \ ++ } \ ++ cpumask_complement(chk, func(cpu)) ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++ cpumask_complement(chk, cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask); ++#endif ++#ifdef CONFIG_SCHED_MC ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask); ++#endif ++ ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "bmq: cpu#%d affinity mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ printk(KERN_INFO "bmq: cpu#%d llc_id = %d, llc_mask idx = %ld\n", ++ cpu, per_cpu(sd_llc_id, cpu), ++ per_cpu(sched_cpu_llc_mask, cpu) - ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; 
++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < bmq_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ bmq_init(&rq->queue); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/bmq_debug.c b/kernel/sched/bmq_debug.c +new file mode 100644 +index 000000000000..375a1a805d86 +--- /dev/null ++++ b/kernel/sched/bmq_debug.c +@@ -0,0 +1,31 @@ ++/* ++ * kernel/sched/bmq_debug.c ++ * ++ * Print the BMQ debugging details ++ * ++ * Author: Alfred Chen ++ * Date : 2020 ++ */ ++#include "bmq_sched.h" ++ ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} +diff --git a/kernel/sched/bmq_sched.h b/kernel/sched/bmq_sched.h +new file mode 100644 +index 000000000000..449d6b54a253 +--- /dev/null ++++ b/kernel/sched/bmq_sched.h +@@ -0,0 +1,509 @@ ++#ifndef BMQ_SCHED_H ++#define BMQ_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* bits: ++ * RT, Low prio adj range, nice width, high prio adj range, cpu idle task */ ++#define bmq_BITS (NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 2) ++#define IDLE_TASK_SCHED_PRIO (bmq_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, bmq_BITS); ++ struct list_head heads[bmq_BITS]; ++}; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. 
++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++ struct bmq queue; ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern bool sched_smp_initialized; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++ ++static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, ++ const cpumask_t *mask) ++{ ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? 
cpu : ++ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
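irq_time_read() above relies on the u64_stats fetch/retry pattern: the reader keeps re-reading until the sequence count sampled before and after the read matches, so it never returns a torn 64-bit value. A toy user-space sketch of that retry loop (this is only the shape of the pattern, not a data-race-free seqlock; the kernel helpers also handle memory ordering and are much cheaper on 64-bit):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;
static uint64_t total;                       /* value guarded by the counter */

static void writer_add(uint64_t delta)
{
        atomic_fetch_add(&seq, 1);           /* odd: update in progress */
        total += delta;
        atomic_fetch_add(&seq, 1);           /* even: update complete */
}

static uint64_t reader_read(void)
{
        unsigned int s1, s2;
        uint64_t v;

        do {
                s1 = atomic_load(&seq);
                v  = total;
                s2 = atomic_load(&seq);
        } while (s1 != s2 || (s1 & 1));      /* retry if torn or mid-update */
        return v;
}

int main(void)
{
        writer_add(123);
        printf("%llu\n", (unsigned long long)reader_read());  /* prints 123 */
        return 0;
}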
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* BMQ_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 9b8916fd00a2..9073fba046c8 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_BMQ */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_BMQ + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -916,6 +927,7 @@ static int __init sugov_register(void) + core_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_BMQ + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_BMQ */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index d43318a489f2..1a312bb6f4a1 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. 
*/ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -661,7 +661,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index ffa959e91227..469f36c89a9d 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * idle-task scheduling class. + */ +@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..22c20e28b613 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..4da52afaeff8 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_BMQ + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_BMQ + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_BMQ */ + + #else + ++#ifndef CONFIG_SCHED_BMQ + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 280a3c735935..db07c37806bc 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq_sched.h" ++#else ++ + #include + + #include +@@ -2487,3 +2491,9 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_BMQ */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..0cc040a28d3f 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6ec1e595b1d4..f02bbaf837b3 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -4,6 +4,7 @@ + */ + #include "sched.h" + ++#ifndef CONFIG_SCHED_BMQ + DEFINE_MUTEX(sched_domains_mutex); + + /* Protected by sched_domains_mutex: */ +@@ -1182,8 +1183,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + */ + + static int default_relax_domain_level = -1; ++#endif /* CONFIG_SCHED_BMQ */ + int sched_domain_level_max; + ++#ifndef CONFIG_SCHED_BMQ + static int __init setup_relax_domain_level(char *str) + { + if (kstrtoint(str, 0, &default_relax_domain_level)) +@@ -1425,6 +1428,7 @@ sd_init(struct sched_domain_topology_level *tl, + + return sd; + } ++#endif /* CONFIG_SCHED_BMQ */ + + /* + * Topology list, bottom-up. 
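The task_running_nice() fallback added to sched.h above simply checks task_nice(p) > 0, which in mainline terms means the task's static priority sits above DEFAULT_PRIO (priority = 120 + nice for nice values -20..19); the BMQ variant earlier in bmq_sched.h folds its boost_prio adjustment into the same comparison against DEFAULT_PRIO plus MAX_PRIORITY_ADJ. A small worked example of the plain mapping:

/* Worked example of the nice <-> priority mapping behind the non-BMQ
 * task_running_nice() above. Stand-alone; not part of the patch. */
#include <stdio.h>

#define DEFAULT_PRIO 120

static int nice_to_prio(int nice)  { return DEFAULT_PRIO + nice; }
static int running_nice(int nice)  { return nice > 0; }

int main(void)
{
        for (int nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> prio %3d, counted as nice: %d\n",
                       nice, nice_to_prio(nice), running_nice(nice));
        return 0;
}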
+@@ -1454,6 +1458,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) + sched_domain_topology = tl; + } + ++#ifndef CONFIG_SCHED_BMQ + #ifdef CONFIG_NUMA + + static const struct cpumask *sd_numa_mask(int cpu) +@@ -2289,3 +2294,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + mutex_unlock(&sched_domains_mutex); + } ++#else /* CONFIG_SCHED_BMQ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{} ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++#endif +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 70665934d53e..8d0157d9932e 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -132,6 +132,10 @@ static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_BMQ ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BMQ) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_BMQ + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_BMQ */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_BMQ ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..70b97fe0ff44 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_BMQ + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void 
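The yield_type entry added to kern_table above shows up under /proc/sys/kernel/, bounded to 0..2 by the zero/two limits, so it can be inspected or tuned from user space; the meaning of each value is defined by the BMQ scheduler code itself. A small read-only sketch (writing works the same way with fopen(..., "w") as root):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/yield_type", "r");
        int val;

        if (!f || fscanf(f, "%d", &val) != 1) {
                perror("yield_type");            /* absent unless BMQ is built in */
                return 1;
        }
        printf("kernel.yield_type = %d\n", val);
        fclose(f);
        return 0;
}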
check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..208788fcbb0e 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_BMQ ++ /* No deadline on BMQ, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux56-rc-tkg/linux56-tkg-patches/0009-glitched-bmq.patch b/linux56-rc-tkg/linux56-tkg-patches/0009-glitched-bmq.patch new file mode 100644 index 0000000..38666e4 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0009-glitched-bmq.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. 
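The extra HZ choices patched into Kconfig.hz here are just shorter tick periods: the period is 1/HZ, so 500 Hz halves the 4 ms tick of the former HZ_250 default and 750 Hz brings it down to roughly 1.33 ms. A quick check of the arithmetic:

/* Tick period for the HZ values offered by the patched Kconfig.hz. */
#include <stdio.h>

int main(void)
{
        const int hz[] = { 100, 250, 300, 500, 750, 1000 };

        for (unsigned int i = 0; i < sizeof(hz) / sizeof(hz[0]); i++)
                printf("HZ=%4d -> tick period %.2f ms\n", hz[i], 1000.0 / hz[i]);
        return 0;
}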
+ ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. diff --git a/linux56-rc-tkg/linux56-tkg-patches/0009-glitched-ondemand-bmq.patch b/linux56-rc-tkg/linux56-tkg-patches/0009-glitched-ondemand-bmq.patch new file mode 100644 index 0000000..6e30518 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0009-glitched-ondemand-bmq.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux56-rc-tkg/linux56-tkg-patches/0011-ZFS-fix.patch b/linux56-rc-tkg/linux56-tkg-patches/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff --git a/linux56-rc-tkg/linux56-tkg-patches/0013-tp_smapi_ec.patch b/linux56-rc-tkg/linux56-tkg-patches/0013-tp_smapi_ec.patch new file mode 100644 index 0000000..59a9104 --- /dev/null +++ b/linux56-rc-tkg/linux56-tkg-patches/0013-tp_smapi_ec.patch @@ -0,0 +1,2415 @@ 
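The vmscan.c hunk above only changes the compiled-in default of vm_swappiness from 60 to 20; the effective value can still be inspected at runtime (and overridden through sysctl or /etc/sysctl.d). A small user-space check:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/vm/swappiness", "r");
        int val;

        if (!f || fscanf(f, "%d", &val) != 1) {
                perror("swappiness");
                return 1;
        }
        printf("vm.swappiness = %d\n", val);
        fclose(f);
        return 0;
}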
+diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c +index 55d33500d55e..744e84228a1f 100644 +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -1338,7 +1338,9 @@ static int set_input_params(struct psmouse *psmouse, + if (psmouse_matches_pnp_id(psmouse, topbuttonpad_pnp_ids) && + !SYN_CAP_EXT_BUTTONS_STICK(info->ext_cap_10)) + __set_bit(INPUT_PROP_TOPBUTTONPAD, dev->propbit); +- } ++ } else if (SYN_CAP_CLICKPAD2BTN(info->ext_cap_0c) || ++ SYN_CAP_CLICKPAD2BTN2(info->ext_cap_0c)) ++ __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); + + return 0; + } +diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h +index fc00e005c611..4cfbeec3ae4c 100644 +--- a/drivers/input/mouse/synaptics.h ++++ b/drivers/input/mouse/synaptics.h +@@ -86,6 +86,7 @@ + */ + #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & BIT(20)) /* 1-button ClickPad */ + #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & BIT(8)) /* 2-button ClickPad */ ++#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & BIT(21)) /* 2-button ClickPad */ + #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & BIT(17)) + #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & BIT(13)) + #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & BIT(19)) +diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig +index 97a420c11eed..c8621e9b2e4a 100644 +--- a/drivers/macintosh/Kconfig ++++ b/drivers/macintosh/Kconfig +@@ -159,6 +159,13 @@ config INPUT_ADBHID + + If unsure, say Y. + ++config ADB_TRACKPAD_ABSOLUTE ++ bool "Enable absolute mode for adb trackpads" ++ depends on INPUT_ADBHID ++ help ++ Enable absolute mode in adb-base trackpads. This feature adds ++ compatibility with synaptics Xorg / Xfree drivers. ++ + config MAC_EMUMOUSEBTN + tristate "Support for mouse button 2+3 emulation" + depends on SYSCTL && INPUT +diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c +index a261892c03b3..a85192de840c 100644 +--- a/drivers/macintosh/adbhid.c ++++ b/drivers/macintosh/adbhid.c +@@ -262,6 +262,15 @@ static struct adb_ids buttons_ids; + #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ + #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ + ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++#define ABS_XMIN 310 ++#define ABS_XMAX 1700 ++#define ABS_YMIN 200 ++#define ABS_YMAX 1000 ++#define ABS_ZMIN 0 ++#define ABS_ZMAX 55 ++#endif ++ + static void + adbhid_keyboard_input(unsigned char *data, int nb, int apoll) + { +@@ -405,6 +414,9 @@ static void + adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + { + int id = (data[0] >> 4) & 0x0f; ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ int btn = 0; int x_axis = 0; int y_axis = 0; int z_axis = 0; ++#endif + + if (!adbhid[id]) { + pr_err("ADB HID on ID %d not yet registered\n", id); +@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + high bits of y-axis motion. XY is additional + high bits of x-axis motion. + ++ For ADB Absolute motion protocol the data array will contain the ++ following values: ++ ++ BITS COMMENTS ++ data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. ++ data[1] = byyy yyyy Left button and y-axis motion. ++ data[2] = bxxx xxxx Second button and x-axis motion. ++ data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. ++ data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion. ++ data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. ++ + MacAlly 2-button mouse protocol. 
+ + For MacAlly 2-button mouse protocol the data array will contain the +@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + switch (adbhid[id]->mouse_kind) + { + case ADBMOUSE_TRACKPAD: ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | ++ ((data[4] & 0x07) << 10); ++ y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | ++ ((data[4] & 0x70) << 6); ++ z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); ++ btn = (!(data[1] >> 7)) & 1; ++#else + data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); + data[2] = data[2] | 0x80; ++#endif + break; + case ADBMOUSE_MICROSPEED: + data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); +@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + break; + } + +- input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); +- input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ if ( adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD ) { + +- if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) +- input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); ++ if(z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1); ++ if(z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0); + +- input_report_rel(adbhid[id]->input, REL_X, +- ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); +- input_report_rel(adbhid[id]->input, REL_Y, +- ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 )); ++ if(z_axis > 0){ ++ input_report_abs(adbhid[id]->input, ABS_X, x_axis); ++ input_report_abs(adbhid[id]->input, ABS_Y, y_axis); ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5); ++ } else { ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0); ++ } ++ ++ input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); ++ input_report_key(adbhid[id]->input, BTN_LEFT, btn); ++ } else { ++#endif ++ input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); ++ input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++ ++ if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) ++ input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + ++ input_report_rel(adbhid[id]->input, REL_X, ++ ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); ++ input_report_rel(adbhid[id]->input, REL_Y, ++ ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ } ++#endif + input_sync(adbhid[id]->input); + } + +@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id, + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ set_bit(EV_ABS, input_dev->evbit); ++ input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); ++ set_bit(BTN_TOUCH, input_dev->keybit); ++ set_bit(BTN_TOOL_FINGER, input_dev->keybit); ++ set_bit(ABS_TOOL_WIDTH, input_dev->absbit); ++#endif + break; + + case ADB_MISC: +@@ -1132,7 +1195,11 @@ init_trackpad(int id) + r1_buffer[3], + r1_buffer[4], + r1_buffer[5], ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ 0x00, /* Enable absolute mode */ ++#else + 0x03, /*r1_buffer[6],*/ ++#endif + r1_buffer[7]); + + /* Without this flush, the trackpad may be locked up */ +diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig +index ac4d48830415..b272132ac742 100644 +--- a/drivers/platform/x86/Kconfig ++++ b/drivers/platform/x86/Kconfig +@@ -573,9 +573,28 @@ config THINKPAD_ACPI_HOTKEY_POLL + If you are not sure, say Y here. The driver enables polling only if + it is strictly necessary to do so. + ++config THINKPAD_EC ++ tristate ++ ---help--- ++ This is a low-level driver for accessing the ThinkPad H8S embedded ++ controller over the LPC bus (not to be confused with the ACPI Embedded ++ Controller interface). ++ ++config TP_SMAPI ++ tristate "ThinkPad SMAPI Support" ++ select THINKPAD_EC ++ default n ++ help ++ This adds SMAPI support on Lenovo/IBM ThinkPads, for features such ++ as battery charging control. For more information about this driver ++ see . ++ ++ If you have a Lenovo/IBM ThinkPad laptop, say Y or M here. ++ + config SENSORS_HDAPS + tristate "Thinkpad Hard Drive Active Protection System (hdaps)" + depends on INPUT ++ select THINKPAD_EC + help + This driver provides support for the IBM Hard Drive Active Protection + System (hdaps), which provides an accelerometer and other misc. data. +diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile +index 2ba6cb795338..399f8b88646f 100644 +--- a/drivers/platform/x86/Makefile ++++ b/drivers/platform/x86/Makefile +@@ -35,6 +35,8 @@ obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o + obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o + obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o + obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o ++obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o ++obj-$(CONFIG_TP_SMAPI) += tp_smapi.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o + obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o + obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o +diff --git a/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c +new file mode 100644 +index 000000000000..597614bc17e6 +--- /dev/null ++++ b/drivers/platform/x86/thinkpad_ec.c +@@ -0,0 +1,513 @@ ++/* ++ * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions ++ * ++ * The embedded controller on ThinkPad laptops has a non-standard interface, ++ * where LPC channel 3 of the H8S EC chip is hooked up to IO ports ++ * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. 
++ * The EC LPC interface provides various system management services (currently ++ * known: battery information and accelerometer readouts). This driver ++ * provides access and mutual exclusion for the EC interface. ++* ++ * The LPC protocol and terminology are documented here: ++ * "H8S/2104B Group Hardware Manual", ++ * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf ++ * ++ * Copyright (C) 2006-2007 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++ #include ++#else ++ #include ++#endif ++ ++#define TP_VERSION "0.42" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++/* IO ports used by embedded controller LPC channel 3: */ ++#define TPC_BASE_PORT 0x1600 ++#define TPC_NUM_PORTS 0x20 ++#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ ++#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ ++#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. */ ++ /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 00x%02x", \ ++ msg, args->val[0x0], args->val[0xF], code) ++ ++/* State of request prefetching: */ ++static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ ++static u64 prefetch_jiffies; /* time of prefetch, or: */ ++#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ ++#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ ++ ++/* Locking: */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(thinkpad_ec_mutex); ++#else ++static DEFINE_SEMAPHORE(thinkpad_ec_mutex); ++#endif ++ ++/* Kludge in case the ACPI DSDT reserves the ports we need. */ ++static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ ++static int reserved_io; /* Successfully reserved the ports? */ ++module_param_named(force_io, force_io, bool, 0600); ++MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); ++ ++/** ++ * thinkpad_ec_lock - get lock on the ThinkPad EC ++ * ++ * Get exclusive lock for accesing the ThinkPad embedded controller LPC3 ++ * interface. Returns 0 iff lock acquired. ++ */ ++int thinkpad_ec_lock(void) ++{ ++ int ret; ++ ret = down_interruptible(&thinkpad_ec_mutex); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_lock); ++ ++/** ++ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC ++ * ++ * Try getting an exclusive lock for accesing the ThinkPad embedded ++ * controller LPC3. Returns immediately if lock is not available; neither ++ * blocks nor sleeps. Returns 0 iff lock acquired . 
++ */ ++int thinkpad_ec_try_lock(void) ++{ ++ return down_trylock(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); ++ ++/** ++ * thinkpad_ec_unlock - release lock on ThinkPad EC ++ * ++ * Release a previously acquired exclusive lock on the ThinkPad ebmedded ++ * controller LPC3 interface. ++ */ ++void thinkpad_ec_unlock(void) ++{ ++ up(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); ++ ++/** ++ * thinkpad_ec_request_row - tell embedded controller to prepare a row ++ * @args Input register arguments ++ * ++ * Requests a data row by writing to H8S LPC registers TRW0 through TWR15 (or ++ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group ++ * Hardware Manual". Does sanity checks via status register STR3. ++ */ ++static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) ++{ ++ u8 str3; ++ int i; ++ ++ /* EC protocol requires write to TWR0 (function code): */ ++ if (!(args->mask & 0x0001)) { ++ printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); ++ return -EINVAL; ++ } ++ ++ /* Check initial STR3 status: */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) { /* data already pending */ ++ inb(TPC_TWR15_PORT); /* marks end of previous transaction */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC has result from unrequested transaction", ++ str3)); ++ return -EBUSY; /* EC will be ready in a few usecs */ ++ } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC is busy with unrequested transaction", ++ str3)); ++ return -EBUSY; /* data will be pending in a few usecs */ ++ } else if (str3 != 0x00) { /* unexpected status? */ ++ printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR0MW: */ ++ outb(args->val[0], TPC_TWR0_PORT); ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 != H8S_STR3_MWMF) { /* not accepted? */ ++ printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR1 through TWR14: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((args->mask>>i)&1) ++ outb(args->val[i], TPC_TWR0_PORT+i); ++ ++ /* Send TWR15 (default to 0x01). This marks end of command. */ ++ outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); ++ ++ /* Wait until EC starts writing its reply (~60ns on average). ++ * Releasing locks before this happens may cause an EC hang ++ * due to firmware bug! ++ */ ++ for (i = 0; i < TPC_REQUEST_RETRIES; i++) { ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_SWMF) /* EC started replying */ ++ return 0; ++ else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) ++ /* Normal progress (the EC hasn't seen the request ++ * yet, or is processing it). Wait it out. */ ++ ndelay(TPC_REQUEST_NDELAY); ++ else { /* weird EC status */ ++ printk(KERN_WARNING ++ REQ_FMT("bad end STR3", str3)); ++ return -EIO; ++ } ++ } ++ printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); ++ return -EIO; ++} ++ ++/** ++ * thinkpad_ec_read_data - read pre-requested row-data from EC ++ * @args Input register arguments of pre-requested rows ++ * @data Output register values ++ * ++ * Reads current row data from the controller, assuming it's already ++ * requested. Follows the H8S spec for register access and status checks. 
++ */ ++static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int i; ++ u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ /* Once we make a request, STR3 assumes the sequence of values listed ++ * in the following 'if' as it reads the request and writes its data. ++ * It takes about a few dozen nanosecs total, with very high variance. ++ */ ++ if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || ++ str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! */ ++ str3 == H8S_STR3_SWMF) ++ return -EBUSY; /* not ready yet */ ++ /* Finally, the EC signals output buffer full: */ ++ if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { ++ printk(KERN_WARNING ++ REQ_FMT("bad initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Read first byte (signals start of read transactions): */ ++ data->val[0] = inb(TPC_TWR0_PORT); ++ /* Optionally read 14 more bytes: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((data->mask >> i)&1) ++ data->val[i] = inb(TPC_TWR0_PORT+i); ++ /* Read last byte from 0x161F (signals end of read transaction): */ ++ data->val[0xF] = inb(TPC_TWR15_PORT); ++ ++ /* Readout still pending? */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) ++ printk(KERN_WARNING ++ REQ_FMT("OBF3B=1 after read", str3)); ++ /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ ++ if (data->val[0xF] == 0x80) ++ printk(KERN_WARNING ++ REQ_FMT("0x161F reports error", data->val[0xF])); ++ return 0; ++} ++ ++/** ++ * thinkpad_ec_is_row_fetched - is the given row currently prefetched? ++ * ++ * To keep things simple we compare only the first and last args; ++ * this suffices for all known cases. ++ */ ++static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args) ++{ ++ return (prefetch_jiffies != TPC_PREFETCH_NONE) && ++ (prefetch_jiffies != TPC_PREFETCH_JUNK) && ++ (prefetch_arg0 == args->val[0]) && ++ (prefetch_argF == args->val[0xF]) && ++ (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT); ++} ++ ++/** ++ * thinkpad_ec_read_row - request and read data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Read a data row from the ThinkPad embedded controller LPC3 interface. ++ * Does fetching and retrying if needed. The row is specified by an ++ * array of 16 bytes, some of which may be undefined (but the first is ++ * mandatory). These bytes are given in @args->val[], where @args->val[i] is ++ * used iff (@args->mask>>i)&1). The resulting row data is stored in ++ * @data->val[], but is only guaranteed to be valid for indices corresponding ++ * to set bit in @data->mask. That is, if @data->mask&(1<val[i] is undefined. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. 
++ */ ++int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int retries, ret; ++ ++ if (thinkpad_ec_is_row_fetched(args)) ++ goto read_row; /* already requested */ ++ ++ /* Request the row */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_request_row(args); ++ if (!ret) ++ goto read_row; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ printk(KERN_ERR REQ_FMT("failed requesting row", ret)); ++ goto out; ++ ++read_row: ++ /* Read the row's data */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ goto out; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ ++ printk(KERN_ERR REQ_FMT("failed waiting for data", ret)); ++ ++out: ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_read_row); ++ ++/** ++ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Try reading a data row from the ThinkPad embedded controller LPC3 ++ * interface, if this raw was recently prefetched using ++ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block. ++ * The parameters have the same meaning as in thinkpad_ec_read_row(). ++ * ++ * Returns -EBUSY is data not ready and -ENODATA if row not prefetched. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int ret; ++ if (!thinkpad_ec_is_row_fetched(args)) { ++ ret = -ENODATA; ++ } else { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */ ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row); ++ ++/** ++ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC ++ * @args Input register arguments ++ * ++ * Prefetch a data row from the ThinkPad embedded controller LCP3 ++ * interface. A subsequent call to thinkpad_ec_read_row() with the ++ * same arguments will be faster, and a subsequent call to ++ * thinkpad_ec_try_read_row() stands a good chance of succeeding if ++ * done neither too soon nor too late. See ++ * thinkpad_ec_read_row() for the meaning of @args. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) ++{ ++ int ret; ++ ret = thinkpad_ec_request_row(args); ++ if (ret) { ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ } else { ++ prefetch_jiffies = get_jiffies_64(); ++ prefetch_arg0 = args->val[0x0]; ++ prefetch_argF = args->val[0xF]; ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); ++ ++/** ++ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data ++ * ++ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the ++ * ThinkPad embedded controller LPC3 interface. ++ * Must be called before unlocking by any code that accesses the controller ++ * ports directly. ++ */ ++void thinkpad_ec_invalidate(void) ++{ ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); ++ ++ ++/*** Checking for EC hardware ***/ ++ ++/** ++ * thinkpad_ec_test - verify the EC is present and follows protocol ++ * ++ * Ensure the EC LPC3 channel really works on this machine by making ++ * an EC request and seeing if the EC follows the documented H8S protocol. 
++ * The requested row just reads battery status, so it should be harmless to ++ * access it (on a correct EC). ++ * This test writes to IO ports, so execute only after checking DMI. ++ */ ++static int __init thinkpad_ec_test(void) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = /* battery 0 basic status */ ++ { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; ++ struct thinkpad_ec_row data = { .mask = 0x0000 }; ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ return ret; ++} ++ ++/* Search all DMI device names of a given type for a substring */ ++static int __init dmi_find_substring(int type, const char *substr) ++{ ++ const struct dmi_device *dev = NULL; ++ while ((dev = dmi_find_device(type, NULL, dev))) { ++ if (strstr(dev->name, substr)) ++ return 1; ++ } ++ return 0; ++} ++ ++#define TP_DMI_MATCH(vendor,model) { \ ++ .ident = vendor " " model, \ ++ .matches = { \ ++ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ ++ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ ++ } \ ++} ++ ++/* Check DMI for existence of ThinkPad embedded controller */ ++static int __init check_dmi_for_ec(void) ++{ ++ /* A few old models that have a good EC but don't report it in DMI */ ++ struct dmi_system_id tp_whitelist[] = { ++ TP_DMI_MATCH("IBM", "ThinkPad A30"), ++ TP_DMI_MATCH("IBM", "ThinkPad T23"), ++ TP_DMI_MATCH("IBM", "ThinkPad X24"), ++ TP_DMI_MATCH("LENOVO", "ThinkPad"), ++ { .ident = NULL } ++ }; ++ return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, ++ "IBM ThinkPad Embedded Controller") || ++ dmi_check_system(tp_whitelist); ++} ++ ++/*** Init and cleanup ***/ ++ ++static int __init thinkpad_ec_init(void) ++{ ++ if (!check_dmi_for_ec()) { ++ printk(KERN_WARNING ++ "thinkpad_ec: no ThinkPad embedded controller!\n"); ++ return -ENODEV; ++ } ++ ++ if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { ++ reserved_io = 1; ++ } else { ++ printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... ", ++ TPC_BASE_PORT, ++ TPC_BASE_PORT + TPC_NUM_PORTS - 1); ++ if (force_io) { ++ printk("forcing use of unreserved IO ports.\n"); ++ } else { ++ printk("consider using force_io=1.\n"); ++ return -ENXIO; ++ } ++ } ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ if (thinkpad_ec_test()) { ++ printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ return -ENXIO; ++ } ++ printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); ++ return 0; ++} ++ ++static void __exit thinkpad_ec_exit(void) ++{ ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ printk(KERN_INFO "thinkpad_ec: unloaded.\n"); ++} ++ ++module_init(thinkpad_ec_init); ++module_exit(thinkpad_ec_exit); +diff --git a/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c +new file mode 100644 +index 000000000000..209cb6487e24 +--- /dev/null ++++ b/drivers/platform/x86/tp_smapi.c +@@ -0,0 +1,1493 @@ ++/* ++ * tp_smapi.c - ThinkPad SMAPI support ++ * ++ * This driver exposes some features of the System Management Application ++ * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on ++ * models in which the SMAPI BIOS runs in SMM and is invoked by writing ++ * to the APM control port 0xB2. ++ * It also exposes battery status information, obtained from the ThinkPad ++ * embedded controller (via the thinkpad_ec module). ++ * Ancient ThinkPad models use a different interface, supported by the ++ * "thinkpad" module from "tpctl". 
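Once tp_smapi is loaded, the start/stop charge thresholds it manages are typically exposed through sysfs attributes; the exact paths below are an assumption based on how the driver is commonly documented and may differ on a given build. A small user-space reader:

/* Read the charge window as exposed by tp_smapi. Paths are assumptions;
 * adjust to what your system actually provides. */
#include <stdio.h>

static int read_int(const char *path, int *out)
{
        FILE *f = fopen(path, "r");
        int ok;

        if (!f)
                return -1;
        ok = (fscanf(f, "%d", out) == 1) ? 0 : -1;
        fclose(f);
        return ok;
}

int main(void)
{
        int start = 0, stop = 0;

        if (read_int("/sys/devices/platform/smapi/BAT0/start_charge_thresh", &start) ||
            read_int("/sys/devices/platform/smapi/BAT0/stop_charge_thresh", &stop)) {
                perror("tp_smapi sysfs");
                return 1;
        }
        printf("charge window: start at %d%%, stop at %d%%\n", start, stop);
        return 0;
}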
++ *
++ * Many of the battery status values obtained from the EC simply mirror
++ * values provided by the battery's Smart Battery System (SBS) interface, so
++ * their meaning is defined by the Smart Battery Data Specification (see
++ * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec
++ * are given in the code where relevant.
++ *
++ * Copyright (C) 2006 Shem Multinymous .
++ * SMAPI access code based on the mwave driver by Mike Sullivan.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include /* CMOS defines */
++#include
++#include
++#include
++#include
++#include
++#include
++
++#define TP_VERSION "0.42"
++#define TP_DESC "ThinkPad SMAPI Support"
++#define TP_DIR "smapi"
++
++MODULE_AUTHOR("Shem Multinymous");
++MODULE_DESCRIPTION(TP_DESC);
++MODULE_VERSION(TP_VERSION);
++MODULE_LICENSE("GPL");
++
++static struct platform_device *pdev;
++
++static int tp_debug;
++module_param_named(debug, tp_debug, int, 0600);
++MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)");
++
++/* A few macros for printk()ing: */
++#define TPRINTK(level, fmt, args...) \
++ dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args)
++#define DPRINTK(fmt, args...) \
++ do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0)
++
++/*********************************************************************
++ * SMAPI interface
++ */
++
++/* SMAPI functions (register BX when making the SMM call). */
++#define SMAPI_GET_INHIBIT_CHARGE 0x2114
++#define SMAPI_SET_INHIBIT_CHARGE 0x2115
++#define SMAPI_GET_THRESH_START 0x2116
++#define SMAPI_SET_THRESH_START 0x2117
++#define SMAPI_GET_FORCE_DISCHARGE 0x2118
++#define SMAPI_SET_FORCE_DISCHARGE 0x2119
++#define SMAPI_GET_THRESH_STOP 0x211a
++#define SMAPI_SET_THRESH_STOP 0x211b
++
++/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at
++ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD */
++#define SMAPI_RETCODE_EOF 0xff
++static struct { u8 rc; char *msg; int ret; } smapi_retcode[] =
++{
++ {0x00, "OK", 0},
++ {0x53, "SMAPI function is not available", -ENXIO},
++ {0x81, "Invalid parameter", -EINVAL},
++ {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP},
++ {0x90, "System error", -EIO},
++ {0x91, "System is invalid", -EIO},
++ {0x92, "System is busy", -EBUSY},
++ {0xa0, "Device error (disk read error)", -EIO},
++ {0xa1, "Device is busy", -EBUSY},
++ {0xa2, "Device is not attached", -ENXIO},
++ {0xa3, "Device is disabled", -EIO},
++ {0xa4, "Request parameter is out of range", -EINVAL},
++ {0xa5, "Request parameter is not accepted", -EINVAL},
++ {0xa6, "Transient error", -EBUSY}, /* ? */
++ {SMAPI_RETCODE_EOF, "Unknown error code", -EIO}
++};
++
++
++#define SMAPI_MAX_RETRIES 10
++#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */
++static unsigned short smapi_port; /* APM control port, normally 0xB2 */
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
++static DECLARE_MUTEX(smapi_mutex);
++#else
++static DEFINE_SEMAPHORE(smapi_mutex);
++#endif
++
++/**
++ * find_smapi_port - read SMAPI port from NVRAM
++ */
++static int __init find_smapi_port(void)
++{
++ u16 smapi_id = 0;
++ unsigned short port = 0;
++ unsigned long flags;
++
++ spin_lock_irqsave(&rtc_lock, flags);
++ smapi_id = CMOS_READ(0x7C);
++ smapi_id |= (CMOS_READ(0x7D) << 8);
++ spin_unlock_irqrestore(&rtc_lock, flags);
++
++ if (smapi_id != 0x5349) {
++ printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id);
++ return -ENXIO;
++ }
++ spin_lock_irqsave(&rtc_lock, flags);
++ port = CMOS_READ(0x7E);
++ port |= (CMOS_READ(0x7F) << 8);
++ spin_unlock_irqrestore(&rtc_lock, flags);
++ if (port == 0) {
++ printk(KERN_ERR "unable to read SMAPI port number\n");
++ return -ENXIO;
++ }
++ return port;
++}
++
++/**
++ * smapi_request - make a SMAPI call
++ * @inEBX, @inECX, @inEDI, @inESI: input registers
++ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: output registers
++ * @msg: textual error message
++ * Invokes the SMAPI SMBIOS with the given input and output args.
++ * All outputs are optional (can be %NULL).
++ * Returns 0 when successful, and a negative errno constant
++ * (see smapi_retcode above) upon failure.
++ */
++static int smapi_request(u32 inEBX, u32 inECX,
++ u32 inEDI, u32 inESI,
++ u32 *outEBX, u32 *outECX, u32 *outEDX,
++ u32 *outEDI, u32 *outESI, const char **msg)
++{
++ int ret = 0;
++ int i;
++ int retries;
++ u8 rc;
++ /* Must use local vars for output regs, due to reg pressure. */
++ u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI;
++
++ for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) {
++ DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x",
++ inEBX, inECX, inEDI, inESI);
++
++ /* SMAPI's SMBIOS call and thinkpad_ec end up using
++ * different interfaces to the same chip, so play it safe. */
++ ret = thinkpad_ec_lock();
++ if (ret)
++ return ret;
++
++ __asm__ __volatile__(
++ "movl $0x00005380,%%eax\n\t"
++ "movl %6,%%ebx\n\t"
++ "movl %7,%%ecx\n\t"
++ "movl %8,%%edi\n\t"
++ "movl %9,%%esi\n\t"
++ "xorl %%edx,%%edx\n\t"
++ "movw %10,%%dx\n\t"
++ "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */
++ "out %%al,$0x4F\n\t"
++ "movl %%eax,%0\n\t"
++ "movl %%ebx,%1\n\t"
++ "movl %%ecx,%2\n\t"
++ "movl %%edx,%3\n\t"
++ "movl %%edi,%4\n\t"
++ "movl %%esi,%5\n\t"
++ :"=m"(tmpEAX),
++ "=m"(tmpEBX),
++ "=m"(tmpECX),
++ "=m"(tmpEDX),
++ "=m"(tmpEDI),
++ "=m"(tmpESI)
++ :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI),
++ "m"((u16)smapi_port)
++ :"%eax", "%ebx", "%ecx", "%edx", "%edi",
++ "%esi");
++
++ thinkpad_ec_invalidate();
++ thinkpad_ec_unlock();
++
++ /* Don't let the next SMAPI access happen too quickly,
++ * may cause problems. (We're holding smapi_mutex.) */
++ msleep(50);
++
++ if (outEBX) *outEBX = tmpEBX;
++ if (outECX) *outECX = tmpECX;
++ if (outEDX) *outEDX = tmpEDX;
++ if (outESI) *outESI = tmpESI;
++ if (outEDI) *outEDI = tmpEDI;
++
++ /* Look up error code */
++ rc = (tmpEAX>>8)&0xFF;
++ for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF &&
++ smapi_retcode[i].rc != rc; ++i) {}
++ ret = smapi_retcode[i].ret;
++ if (msg)
++ *msg = smapi_retcode[i].msg;
++
++ DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d",
++ tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret);
++ if (ret)
++ TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)",
++ smapi_retcode[i].msg, inEBX);
++
++ if (ret != -EBUSY)
++ return ret;
++ }
++ return ret;
++}
++
++/* Convenience wrapper: discard output arguments */
++static int smapi_write(u32 inEBX, u32 inECX,
++ u32 inEDI, u32 inESI, const char **msg)
++{
++ return smapi_request(inEBX, inECX, inEDI, inESI,
++ NULL, NULL, NULL, NULL, NULL, msg);
++}
++
++
++/*********************************************************************
++ * Specific SMAPI services
++ * All of these functions return 0 upon success, and a negative errno
++ * constant (see smapi_retcode) on failure.
++ */
++
++enum thresh_type {
++ THRESH_STOP = 0, /* the code assumes this is 0 for brevity */
++ THRESH_START
++};
++#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop")
++
++/**
++ * __get_real_thresh - read battery charge start/stop threshold from SMAPI
++ * @bat: battery number (0 or 1)
++ * @which: THRESH_START or THRESH_STOP
++ * @thresh: 1..99, 0=default (pass this as-is to SMAPI)
++ * @outEDI: some additional state that needs to be preserved, meaning unknown
++ * @outESI: some additional state that needs to be preserved, meaning unknown
++ */
++static int __get_real_thresh(int bat, enum thresh_type which, int *thresh,
++ u32 *outEDI, u32 *outESI)
++{
++ u32 ebx = (which == THRESH_START) ? SMAPI_GET_THRESH_START
++ : SMAPI_GET_THRESH_STOP;
++ u32 ecx = (bat+1)<<8;
++ const char *msg;
++ int ret = smapi_request(ebx, ecx, 0, 0, NULL,
++ &ecx, NULL, outEDI, outESI, &msg);
++ if (ret) {
++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s",
++ THRESH_NAME(which), bat, msg);
++ return ret;
++ }
++ if (!(ecx&0x00000100)) {
++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x",
++ THRESH_NAME(which), bat, ecx);
++ return -EIO;
++ }
++ if (thresh)
++ *thresh = ecx&0xFF;
++ return 0;
++}
++
++/**
++ * get_real_thresh - read battery charge start/stop threshold from SMAPI
++ * @bat: battery number (0 or 1)
++ * @which: THRESH_START or THRESH_STOP
++ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
++ */
++static int get_real_thresh(int bat, enum thresh_type which, int *thresh)
++{
++ return __get_real_thresh(bat, which, thresh, NULL, NULL);
++}
++
++/**
++ * set_real_thresh - write battery start/stop charge threshold to SMAPI
++ * @bat: battery number (0 or 1)
++ * @which: THRESH_START or THRESH_STOP
++ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
++ */
++static int set_real_thresh(int bat, enum thresh_type which, int thresh)
++{
++ u32 ebx = (which == THRESH_START) ?
SMAPI_SET_THRESH_START ++ : SMAPI_SET_THRESH_STOP; ++ u32 ecx = ((bat+1)<<8) + thresh; ++ u32 getDI, getSI; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); ++ if (ret) ++ return ret; ++ ++ ret = smapi_write(ebx, ecx, getDI, getSI, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", ++ THRESH_NAME(which), thresh, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set %s to %d for bat=%d", ++ THRESH_NAME(which), thresh, bat); ++ return ret; ++} ++ ++/** ++ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * @outECX: some additional state that needs to be preserved, meaning unknown ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) ++{ ++ u32 ecx = (bat+1)<<8; ++ u32 esi; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, &esi, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x0100)) { ++ TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); ++ return -EIO; ++ } ++ if (minutes) ++ *minutes = (ecx&0x0001)?esi:0; ++ if (outECX) ++ *outECX = ecx; ++ return 0; ++} ++ ++/** ++ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int get_inhibit_charge_minutes(int bat, int *minutes) ++{ ++ return __get_inhibit_charge_minutes(bat, minutes, NULL); ++} ++ ++/** ++ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ */ ++static int set_inhibit_charge_minutes(int bat, int minutes) ++{ ++ u32 ecx; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); ++ if (ret) ++ return ret; ++ ++ ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 
0x0001 : 0x0000); ++ if (minutes > 0xFFFF) ++ minutes = 0xFFFF; ++ ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, ++ "set to %d failed for bat=%d: %s", minutes, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); ++ return ret; ++} ++ ++ ++/** ++ * get_force_discharge - get status of forced discharging from SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int get_force_discharge(int bat, int *enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; ++ return 0; ++} ++ ++/** ++ * set_force_discharge - write status of forced discharging to SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int set_force_discharge(int bat, int enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (ecx&0x00000100) { ++ TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); ++ return -EIO; ++ } ++ ++ ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); ++ ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", ++ enabled, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); ++ return ret; ++} ++ ++ ++/********************************************************************* ++ * Wrappers to threshold-related SMAPI functions, which handle default ++ * thresholds and related quirks. ++ */ ++ ++/* Minimum, default and minimum difference for battery charging thresholds: */ ++#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ ++#define MIN_THRESH_START 2 ++#define MAX_THRESH_START (100-MIN_THRESH_DELTA) ++#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) ++#define MAX_THRESH_STOP 100 ++#define DEFAULT_THRESH_START MAX_THRESH_START ++#define DEFAULT_THRESH_STOP MAX_THRESH_STOP ++ ++/* The GUI of IBM's Battery Maximizer seems to show a start threshold that ++ * is 1 more than the value we set/get via SMAPI. Since the threshold is ++ * maintained across reboot, this can be confusing. So we kludge our ++ * interface for interoperability: */ ++#define BATMAX_FIX 1 ++ ++/* Get charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. */ ++static int get_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ int ret = get_real_thresh(bat, which, thresh); ++ if (ret) ++ return ret; ++ if (*thresh == 0) ++ *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START ++ : DEFAULT_THRESH_STOP; ++ else if (which == THRESH_START) ++ *thresh += BATMAX_FIX; ++ return 0; ++} ++ ++ ++/* Set charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. 
*/ ++static int set_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) ++ thresh = 0; /* 100 is out of range, but default means 100 */ ++ if (which == THRESH_START) ++ thresh -= BATMAX_FIX; ++ return set_real_thresh(bat, which, thresh); ++} ++ ++/********************************************************************* ++ * ThinkPad embedded controller readout and basic functions ++ */ ++ ++/** ++ * read_tp_ec_row - read data row from the ThinkPad embedded controller ++ * @arg0: EC command code ++ * @bat: battery number, 0 or 1 ++ * @j: the byte value to be used for "junk" (unused) input/outputs ++ * @dataval: result vector ++ */ ++static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = { .mask = 0xFFFF, ++ .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; ++ struct thinkpad_ec_row data = { .mask = 0xFFFF }; ++ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); ++ return ret; ++} ++ ++/** ++ * power_device_present - check for presence of battery or AC power ++ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power ++ * Returns 1 if present, 0 if not present, negative if error. ++ */ ++static int power_device_present(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u8 test; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (bat) { ++ case 0: test = 0x40; break; /* battery 0 */ ++ case 1: test = 0x20; break; /* battery 1 */ ++ default: test = 0x80; /* AC power */ ++ } ++ return (row[0] & test) ? 1 : 0; ++} ++ ++/** ++ * bat_has_status - check if battery can report detailed status ++ * @bat: 0 for battery 0, 1 for battery 1 ++ * Returns 1 if yes, 0 if no, negative if error. ++ */ ++static int bat_has_status(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ ++ return 0; ++ if ((row[1] & (0x60)) == 0) /* no status */ ++ return 0; ++ return 1; ++} ++ ++/** ++ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data ++ * @arg0: first argument to EC ++ * @off: offset in row returned from EC ++ * @bat: battery (0 or 1) ++ * @val: the 16-bit value obtained ++ * Returns nonzero on error. 
++ */ ++static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ *val = *(u16 *)(row+offset); ++ return 0; ++} ++ ++/********************************************************************* ++ * sysfs attributes for batteries - ++ * definitions and helper functions ++ */ ++ ++/* A custom device attribute struct which holds a battery number */ ++struct bat_device_attribute { ++ struct device_attribute dev_attr; ++ int bat; ++}; ++ ++/** ++ * attr_get_bat - get the battery to which the attribute belongs ++ */ ++static int attr_get_bat(struct device_attribute *attr) ++{ ++ return container_of(attr, struct bat_device_attribute, dev_attr)->bat; ++} ++ ++/** ++ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @na_msg: string to output is value not available (0xFFFFFFFF) ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as unsigned, ++ * transformed as x->mul*x, and printed to the buffer. ++ * If the value is 0xFFFFFFFF and na_msg!=%NULL, na_msg is printed instead. ++ */ ++static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, ++ const char *na_msg, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ if (na_msg && val == 0xFFFF) ++ return sprintf(buf, "%s\n", na_msg); ++ else ++ return sprintf(buf, "%u\n", mul*(unsigned int)val); ++} ++ ++/** ++ * show_tp_ec_bat_s16 - show an signed 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @add: correction term to add after multiplication ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as signed, ++ * transformed as x->mul*x+add, and printed to the buffer. 
++ */ ++static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", mul*(s16)val+add); ++} ++ ++/** ++ * show_tp_ec_bat_str - show a string from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @maxlen: maximum string length ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, ++ struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ strncpy(buf, (char *)row+offset, maxlen); ++ buf[maxlen] = 0; ++ strcat(buf, "\n"); ++ return strlen(buf); ++} ++ ++/** ++ * show_tp_ec_bat_power - show a power readout from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offV: byte offset of voltage in EC raw data ++ * @offI: byte offset of current in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ * Computes the power as current*voltage from the two given readout offsets. ++ */ ++static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int milliamp, millivolt, ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ millivolt = *(u16 *)(row+offV); ++ milliamp = *(s16 *)(row+offI); ++ return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ ++} ++ ++/** ++ * show_tp_ec_bat_date - decode and show a date from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u16 v; ++ int ret; ++ int day, month, year; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ ++ /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ ++ v = *(u16 *)(row+offset); ++ day = v & 0x1F; ++ month = (v >> 5) & 0xF; ++ year = (v >> 9) + 1980; ++ ++ return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O for batteries - ++ * the actual attribute show/store functions ++ */ ++ ++static ssize_t show_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_START, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++static ssize_t show_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_STOP, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++/** ++ * store_battery_start_charge_thresh - store 
battery_start_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ ++ thresh = MIN_THRESH_START; ++ if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ ++ thresh = MAX_THRESH_START; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_STOP, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { ++ if (ret) /* other threshold is set? */ ++ goto out; ++ ret = get_real_thresh(bat, THRESH_START, NULL); ++ if (ret) /* this threshold is set? */ ++ goto out; ++ if (other_thresh < thresh+MIN_THRESH_DELTA) { ++ /* move other thresh to keep it above this one */ ++ ret = set_thresh(bat, THRESH_STOP, ++ thresh+MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_START, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++ ++} ++ ++/** ++ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ ++ thresh = MIN_THRESH_STOP; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_START, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ ++ if (ret) ++ goto out; ++ /* this threshold exists? 
*/ ++ ret = get_real_thresh(bat, THRESH_STOP, NULL); ++ if (ret) ++ goto out; ++ if (other_thresh >= thresh-MIN_THRESH_DELTA) { ++ /* move other thresh to be below this one */ ++ ret = set_thresh(bat, THRESH_START, ++ thresh-MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_STOP, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++} ++ ++static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int minutes; ++ int bat = attr_get_bat(attr); ++ int ret = get_inhibit_charge_minutes(bat, &minutes); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", minutes); /* units: minutes */ ++} ++ ++static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int ret; ++ int minutes; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { ++ TPRINTK(KERN_ERR, "inhibit_charge_minutes: " ++ "must be a non-negative integer"); ++ return -EINVAL; ++ } ++ ret = set_inhibit_charge_minutes(bat, minutes); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int enabled; ++ int bat = attr_get_bat(attr); ++ int ret = get_force_discharge(bat, &enabled); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", enabled); /* type: boolean */ ++} ++ ++static ssize_t store_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int ret; ++ int enabled; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) ++ return -EINVAL; ++ ret = set_force_discharge(bat, enabled); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_installed( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ int ret = power_device_present(bat); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++static ssize_t show_battery_state( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ const char *txt; ++ int ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return sprintf(buf, "none\n"); ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (row[1] & 0xf0) { ++ case 0xc0: txt = "idle"; break; ++ case 0xd0: txt = "discharging"; break; ++ case 0xe0: txt = "charging"; break; ++ default: return sprintf(buf, "unknown (0x%x)\n", row[1]); ++ } ++ return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ ++} ++ ++static ssize_t show_battery_manufacturer( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34: ManufacturerName() */ ++ return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_model( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. 
SBS spec v1.1 p34: DeviceName() */ ++ return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_barcoding( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string */ ++ return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_chemistry( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ ++ return show_tp_ec_bat_str(6, 2, 5, attr, buf); ++} ++ ++static ssize_t show_battery_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p24: Voltage() */ ++ return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_design_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ ++ return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */ ++ return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group0_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group1_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group2_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group3_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_current_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: Current() */ ++ return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_current_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ ++ return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_current( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ ++ return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_power_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*Current() */ ++ return show_tp_ec_bat_power(1, 6, 8, attr, buf); ++} ++ ++static ssize_t show_battery_power_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */ ++ return show_tp_ec_bat_power(1, 6, 10, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. 
SBS spec v1.1 p25: RelativeStateOfCharge() */ ++ return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent_error( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. SBS spec v1.1 p25: MaxError() */ ++ return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_charging_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ ++ return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26. */ ++ return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_last_full_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ ++ return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_design_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p32: DesignCapacity() */ ++ return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_cycle_count( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ ++ return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_temperature( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ ++ return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); ++} ++ ++static ssize_t show_battery_serial( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: int. SBS spec v1.1 p34: SerialNumber() */ ++ return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_manufacture_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ ++ return show_tp_ec_bat_date(3, 8, attr, buf); ++} ++ ++static ssize_t show_battery_first_use_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD */ ++ return show_tp_ec_bat_date(8, 2, attr, buf); ++} ++ ++/** ++ * show_battery_dump - show the battery's dump attribute ++ * The dump attribute gives a hex dump of all EC readouts related to a ++ * battery. Some of the enumerated values don't really exist (i.e., the ++ * EC function just leaves them untouched); we use a kludge to detect and ++ * denote these. 
++ */
++#define MIN_DUMP_ARG0 0x00
++#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */
++static ssize_t show_battery_dump(
++ struct device *dev, struct device_attribute *attr, char *buf)
++{
++ int i;
++ char *p = buf;
++ int bat = attr_get_bat(attr);
++ u8 arg0; /* first argument to EC */
++ u8 rowa[TP_CONTROLLER_ROW_LEN],
++ rowb[TP_CONTROLLER_ROW_LEN];
++ const u8 junka = 0xAA,
++ junkb = 0x55; /* junk values for testing changes */
++ int ret;
++
++ for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) {
++ if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5)
++ return -ENOMEM; /* don't overflow sysfs buf */
++ /* Read raw twice with different junk values,
++ * to detect unused output bytes which are left unchanged: */
++ ret = read_tp_ec_row(arg0, bat, junka, rowa);
++ if (ret)
++ return ret;
++ ret = read_tp_ec_row(arg0, bat, junkb, rowb);
++ if (ret)
++ return ret;
++ for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) {
++ if (rowa[i] == junka && rowb[i] == junkb)
++ p += sprintf(p, "-- "); /* unused by EC */
++ else
++ p += sprintf(p, "%02x ", rowa[i]);
++ }
++ p += sprintf(p, "\n");
++ }
++ return p-buf;
++}
++
++
++/*********************************************************************
++ * sysfs attribute I/O, other than batteries
++ */
++
++static ssize_t show_ac_connected(
++ struct device *dev, struct device_attribute *attr, char *buf)
++{
++ int ret = power_device_present(0xFF);
++ if (ret < 0)
++ return ret;
++ return sprintf(buf, "%d\n", ret); /* type: boolean */
++}
++
++/*********************************************************************
++ * The "smapi_request" sysfs attribute executes a raw SMAPI call.
++ * You write to make a request and read to get the result. The state
++ * is saved globally rather than per fd (sysfs limitation), so
++ * simultaneous requests may get each other's results! So this is for
++ * development and debugging only.
++ */
++#define MAX_SMAPI_ATTR_ANSWER_LEN 128
++static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = "";
++
++static ssize_t show_smapi_request(struct device *dev,
++ struct device_attribute *attr, char *buf)
++{
++ int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer);
++ smapi_attr_answer[0] = '\0';
++ return ret;
++}
++
++static ssize_t store_smapi_request(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t count)
++{
++ unsigned int inEBX, inECX, inEDI, inESI;
++ u32 outEBX, outECX, outEDX, outEDI, outESI;
++ const char *msg;
++ int ret;
++ if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) {
++ smapi_attr_answer[0] = '\0';
++ return -EINVAL;
++ }
++ ret = smapi_request(
++ inEBX, inECX, inEDI, inESI,
++ &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg);
++ snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN,
++ "%x %x %x %x %x %d '%s'\n",
++ (unsigned int)outEBX, (unsigned int)outECX,
++ (unsigned int)outEDX, (unsigned int)outEDI,
++ (unsigned int)outESI, ret, msg);
++ if (ret)
++ return ret;
++ else
++ return count;
++}
++
++/*********************************************************************
++ * Power management: the embedded controller forgets the battery
++ * thresholds when the system is suspended to disk and unplugged from
++ * AC and battery, so we restore them upon resume.
++ */ ++ ++static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ ++ ++static int tp_suspend(struct platform_device *dev, pm_message_t state) ++{ ++ int restore = (state.event == PM_EVENT_HIBERNATE || ++ state.event == PM_EVENT_FREEZE); ++ if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) ++ saved_threshs[0] = -1; ++ if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) ++ saved_threshs[1] = -1; ++ if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) ++ saved_threshs[2] = -1; ++ if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) ++ saved_threshs[3] = -1; ++ DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ return 0; ++} ++ ++static int tp_resume(struct platform_device *dev) ++{ ++ DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ if (saved_threshs[0] >= 0) ++ set_real_thresh(0, THRESH_STOP , saved_threshs[0]); ++ if (saved_threshs[1] >= 0) ++ set_real_thresh(0, THRESH_START, saved_threshs[1]); ++ if (saved_threshs[2] >= 0) ++ set_real_thresh(1, THRESH_STOP , saved_threshs[2]); ++ if (saved_threshs[3] >= 0) ++ set_real_thresh(1, THRESH_START, saved_threshs[3]); ++ return 0; ++} ++ ++ ++/********************************************************************* ++ * Driver model ++ */ ++ ++static struct platform_driver tp_driver = { ++ .suspend = tp_suspend, ++ .resume = tp_resume, ++ .driver = { ++ .name = "smapi", ++ .owner = THIS_MODULE ++ }, ++}; ++ ++ ++/********************************************************************* ++ * Sysfs device model ++ */ ++ ++/* Attributes in /sys/devices/platform/smapi/ */ ++ ++static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); ++static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, ++ store_smapi_request); ++ ++static struct attribute *tp_root_attributes[] = { ++ &dev_attr_ac_connected.attr, ++ &dev_attr_smapi_request.attr, ++ NULL ++}; ++static struct attribute_group tp_root_attribute_group = { ++ .attrs = tp_root_attributes ++}; ++ ++/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : ++ * Every attribute needs to be defined (i.e., statically allocated) for ++ * each battery, and then referenced in the attribute list of each battery. ++ * We use preprocessor voodoo to avoid duplicating the list of attributes 4 ++ * times. The preprocessor output is just normal sysfs attributes code. 
++ */
++
++/**
++ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes
++ * @_BAT: battery number (0 or 1)
++ * @_ATTR_RW: macro to invoke for each read/write attribute
++ * @_ATTR_R: macro to invoke for each read-only attribute
++ */
++#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \
++ _ATTR_RW(_BAT, start_charge_thresh) \
++ _ATTR_RW(_BAT, stop_charge_thresh) \
++ _ATTR_RW(_BAT, inhibit_charge_minutes) \
++ _ATTR_RW(_BAT, force_discharge) \
++ _ATTR_R(_BAT, installed) \
++ _ATTR_R(_BAT, state) \
++ _ATTR_R(_BAT, manufacturer) \
++ _ATTR_R(_BAT, model) \
++ _ATTR_R(_BAT, barcoding) \
++ _ATTR_R(_BAT, chemistry) \
++ _ATTR_R(_BAT, voltage) \
++ _ATTR_R(_BAT, group0_voltage) \
++ _ATTR_R(_BAT, group1_voltage) \
++ _ATTR_R(_BAT, group2_voltage) \
++ _ATTR_R(_BAT, group3_voltage) \
++ _ATTR_R(_BAT, current_now) \
++ _ATTR_R(_BAT, current_avg) \
++ _ATTR_R(_BAT, charging_max_current) \
++ _ATTR_R(_BAT, power_now) \
++ _ATTR_R(_BAT, power_avg) \
++ _ATTR_R(_BAT, remaining_percent) \
++ _ATTR_R(_BAT, remaining_percent_error) \
++ _ATTR_R(_BAT, remaining_charging_time) \
++ _ATTR_R(_BAT, remaining_running_time) \
++ _ATTR_R(_BAT, remaining_running_time_now) \
++ _ATTR_R(_BAT, remaining_capacity) \
++ _ATTR_R(_BAT, last_full_capacity) \
++ _ATTR_R(_BAT, design_voltage) \
++ _ATTR_R(_BAT, charging_max_voltage) \
++ _ATTR_R(_BAT, design_capacity) \
++ _ATTR_R(_BAT, cycle_count) \
++ _ATTR_R(_BAT, temperature) \
++ _ATTR_R(_BAT, serial) \
++ _ATTR_R(_BAT, manufacture_date) \
++ _ATTR_R(_BAT, first_use_date) \
++ _ATTR_R(_BAT, dump)
++
++/* Define several macros we will feed into FOREACH_BAT_ATTR: */
++
++#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \
++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \
++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \
++ store_battery_##_NAME), \
++ .bat = _BAT \
++ };
++
++#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \
++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \
++ .dev_attr = __ATTR(_NAME, 0444, show_battery_##_NAME, 0), \
++ .bat = _BAT \
++ };
++
++#define REF_BAT_ATTR(_BAT,_NAME) \
++ &dev_attr_##_NAME##_##_BAT.dev_attr.attr,
++
++/* This provides all attributes for one battery: */
++
++#define PROVIDE_BAT_ATTRS(_BAT) \
++ FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \
++ static struct attribute *tp_bat##_BAT##_attributes[] = { \
++ FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \
++ NULL \
++ }; \
++ static struct attribute_group tp_bat##_BAT##_attribute_group = { \
++ .name = "BAT" #_BAT, \
++ .attrs = tp_bat##_BAT##_attributes \
++ };
++
++/* Finally generate the attributes: */
++
++PROVIDE_BAT_ATTRS(0)
++PROVIDE_BAT_ATTRS(1)
++
++/* List of attribute groups */
++
++static struct attribute_group *attr_groups[] = {
++ &tp_root_attribute_group,
++ &tp_bat0_attribute_group,
++ &tp_bat1_attribute_group,
++ NULL
++};
++
++
++/*********************************************************************
++ * Init and cleanup
++ */
++
++static struct attribute_group **next_attr_group; /* next to register */
++
++static int __init tp_init(void)
++{
++ int ret;
++ printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n");
++
++ ret = find_smapi_port();
++ if (ret < 0)
++ goto err;
++ else
++ smapi_port = ret;
++
++ if (!request_region(smapi_port, 1, "smapi")) {
++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n",
++ smapi_port);
++ ret = -ENXIO;
++ goto err;
++ }
++
++ if (!request_region(SMAPI_PORT2, 1, "smapi")) {
++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n",
++ SMAPI_PORT2);
++ ret = -ENXIO; ++ goto err_port1; ++ } ++ ++ ret = platform_driver_register(&tp_driver); ++ if (ret) ++ goto err_port2; ++ ++ pdev = platform_device_alloc("smapi", -1); ++ if (!pdev) { ++ ret = -ENOMEM; ++ goto err_driver; ++ } ++ ++ ret = platform_device_add(pdev); ++ if (ret) ++ goto err_device_free; ++ ++ for (next_attr_group = attr_groups; *next_attr_group; ++ ++next_attr_group) { ++ ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); ++ if (ret) ++ goto err_attr; ++ } ++ ++ printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", ++ smapi_port); ++ return 0; ++ ++err_attr: ++ while (--next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++err_device_free: ++ platform_device_put(pdev); ++err_driver: ++ platform_driver_unregister(&tp_driver); ++err_port2: ++ release_region(SMAPI_PORT2, 1); ++err_port1: ++ release_region(smapi_port, 1); ++err: ++ printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); ++ return ret; ++} ++ ++static void __exit tp_exit(void) ++{ ++ while (next_attr_group && --next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++ platform_driver_unregister(&tp_driver); ++ release_region(SMAPI_PORT2, 1); ++ if (smapi_port) ++ release_region(smapi_port, 1); ++ ++ printk(KERN_INFO "tp_smapi unloaded.\n"); ++} ++ ++module_init(tp_init); ++module_exit(tp_exit); +diff --git a/fs/exec.c b/fs/exec.c +index 65eaacaba4f4..1d3b310bd5f0 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -63,6 +63,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -866,9 +868,12 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) + if (err) + goto exit; + +- if (name->name[0] != '\0') ++ if (name->name[0] != '\0') { + fsnotify_open(file); + ++ trace_open_exec(name->name); ++ } ++ + out: + return file; + +diff --git a/fs/open.c b/fs/open.c +index cb81623a8b09..a92b0f6061ac 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -34,6 +34,9 @@ + + #include "internal.h" + ++#define CREATE_TRACE_POINTS ++#include ++ + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) + { +@@ -1068,6 +1071,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) + + long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) + { ++ tmp = getname(filename); + struct open_how how = build_open_how(flags, mode); + return do_sys_openat2(dfd, filename, &how); ++ trace_do_sys_open(tmp->name, flags, mode); + } + + +diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h +new file mode 100644 +index 000000000000..fb634b74adf3 +--- /dev/null ++++ b/include/trace/events/fs.h +@@ -0,0 +1,53 @@ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM fs ++ ++#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_FS_H ++ ++#include ++#include ++ ++TRACE_EVENT(do_sys_open, ++ ++ TP_PROTO(const char *filename, int flags, int mode), ++ ++ TP_ARGS(filename, flags, mode), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ __field( int, flags ) ++ __field( int, mode ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ __entry->flags = flags; ++ __entry->mode = mode; ++ ), ++ ++ TP_printk("\"%s\" %x %o", ++ __get_str(filename), __entry->flags, __entry->mode) ++); ++ ++TRACE_EVENT(open_exec, ++ ++ TP_PROTO(const char *filename), ++ ++ TP_ARGS(filename), ++ ++ TP_STRUCT__entry( ++ 
__string( filename, filename ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ ), ++ ++ TP_printk("\"%s\"", ++ __get_str(filename)) ++); ++ ++#endif /* _TRACE_FS_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h +new file mode 100644 +index 000000000000..1b80d7ee5493 +--- /dev/null ++++ b/include/linux/thinkpad_ec.h +@@ -0,0 +1,47 @@ ++/* ++ * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions ++ * ++ * Copyright (C) 2005 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _THINKPAD_EC_H ++#define _THINKPAD_EC_H ++ ++#ifdef __KERNEL__ ++ ++#define TP_CONTROLLER_ROW_LEN 16 ++ ++/* EC transactions input and output (possibly partial) vectors of 16 bytes. */ ++struct thinkpad_ec_row { ++ u16 mask; /* bitmap of which entries of val[] are meaningful */ ++ u8 val[TP_CONTROLLER_ROW_LEN]; ++}; ++ ++extern int __must_check thinkpad_ec_lock(void); ++extern int __must_check thinkpad_ec_try_lock(void); ++extern void thinkpad_ec_unlock(void); ++ ++extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data); ++extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *mask); ++extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); ++extern void thinkpad_ec_invalidate(void); ++ ++ ++#endif /* __KERNEL */ ++#endif /* _THINKPAD_EC_H */
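
Usage note: the interface created by tp_smapi.c above is plain sysfs under /sys/devices/platform/smapi/ (the platform device is registered with the name "smapi", and the per-battery attributes live in the "BAT0"/"BAT1" groups defined by PROVIDE_BAT_ATTRS). The following is a minimal user-space sketch, not part of the patch itself; the attribute file names follow directly from FOREACH_BAT_ATTR and the root attributes (ac_connected), while the program name tp_smapi_demo.c and its structure are purely illustrative, with error handling kept to a minimum.

/*
 * tp_smapi_demo.c - illustrative sketch only, not shipped with the patch.
 * Reads a few tp_smapi battery attributes and optionally sets the stop
 * charge threshold (the latter needs root).
 *
 * Build: cc -o tp_smapi_demo tp_smapi_demo.c
 * Run:   ./tp_smapi_demo          (read-only)
 *        sudo ./tp_smapi_demo 80  (also set stop threshold to 80%)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SMAPI_SYSFS "/sys/devices/platform/smapi"

/* Read one sysfs attribute into buf, stripping the trailing newline. */
static int read_attr(const char *name, char *buf, size_t len)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), SMAPI_SYSFS "/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (!fgets(buf, (int)len, f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	buf[strcspn(buf, "\n")] = '\0';
	return 0;
}

/* Write a string to a sysfs attribute. */
static int write_attr(const char *name, const char *val)
{
	char path[256];
	FILE *f;
	int ret = 0;

	snprintf(path, sizeof(path), SMAPI_SYSFS "/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	if (fprintf(f, "%s\n", val) < 0)
		ret = -1;
	/* sysfs store errors often surface on flush/close, so check fclose too. */
	if (fclose(f) != 0)
		ret = -1;
	return ret;
}

int main(int argc, char **argv)
{
	char buf[128];

	if (read_attr("ac_connected", buf, sizeof(buf)) == 0)
		printf("AC connected:        %s\n", buf);
	if (read_attr("BAT0/installed", buf, sizeof(buf)) == 0)
		printf("BAT0 installed:      %s\n", buf);
	if (read_attr("BAT0/state", buf, sizeof(buf)) == 0)
		printf("BAT0 state:          %s\n", buf);
	if (read_attr("BAT0/remaining_percent", buf, sizeof(buf)) == 0)
		printf("BAT0 charge:         %s%%\n", buf);
	if (read_attr("BAT0/stop_charge_thresh", buf, sizeof(buf)) == 0)
		printf("BAT0 stop threshold: %s%%\n", buf);

	/* Optionally set the stop charge threshold. */
	if (argc > 1) {
		int thresh = atoi(argv[1]);
		if (thresh < 1 || thresh > 100) {
			fprintf(stderr, "threshold must be 1..100\n");
			return 1;
		}
		if (write_attr("BAT0/stop_charge_thresh", argv[1]) != 0) {
			perror("stop_charge_thresh");
			return 1;
		}
		printf("stop threshold set to %d%%\n", thresh);
	}
	return 0;
}

Because store_battery_stop_charge_thresh() clamps the requested value and may move the start threshold to preserve MIN_THRESH_DELTA, the value read back after a write can legitimately differ from the value written.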